# other strats.
# TODO: UCB strat, epsilon-greedy

import scipy.stats as stats
import numpy as np

rand = np.random.rand
beta = stats.beta

class GeneralBanditStrat(object):

    """
    Implements an online learning strategy to solve
    the Multi-Armed Bandit problem.

    parameters:
        bandits: a Bandits instance with a .pull method
        choice_function: accepts a self argument (which gives access to all
            the variables), and returns an int between 0 and n-1

    methods:
        sample_bandits(n): sample and train on n pulls.

    attributes:
        N: the cumulative number of samples
        choices: the historical choices as a (N,) array
        score: the historical score as a (N,) array

    """

    def __init__(self, bandits, choice_function):

        self.bandits = bandits
        n_bandits = len(self.bandits)
        self.wins = np.zeros(n_bandits)
        self.trials = np.zeros(n_bandits)
        self.N = 0
        self.choices = []
        self.score = []
        self.choice_function = choice_function

    def sample_bandits(self, n=1):

        score = np.zeros(n)
        choices = np.zeros(n)

        for k in range(n):
            # sample from the bandits' priors, and select the largest sample
            choice = self.choice_function(self)

            # sample the chosen bandit
            result = self.bandits.pull(choice)

            # update priors and score
            self.wins[choice] += result
            self.trials[choice] += 1
            score[k] = result
            self.N += 1
            choices[k] = choice

        self.score = np.r_[self.score, score]
        self.choices = np.r_[self.choices, choices]
        return


def bayesian_bandit_choice(self):
    # Thompson sampling: draw from each arm's Beta posterior and pick the largest draw.
    return np.argmax(np.random.beta(1 + self.wins, 1 + self.trials - self.wins))


def max_mean(self):
    """pick the bandit with the current best observed proportion of winning"""
    # +1 in the denominator avoids division by zero for untried bandits
    return np.argmax(self.wins / (self.trials + 1))


def lower_credible_choice(self):
    """pick the bandit with the best LOWER BOUND. See chapter 5"""
    def lb(a, b):
        # normal approximation to the lower 95% bound of a Beta(a, b) posterior
        return a / (a + b) - 1.65 * np.sqrt((a * b) / ((a + b) ** 2 * (a + b + 1)))
    a = self.wins + 1
    b = self.trials - self.wins + 1
    return np.argmax(lb(a, b))


def upper_credible_choice(self):
    """pick the bandit with the best UPPER BOUND. See chapter 5"""
    def ub(a, b):
        # normal approximation to the upper 95% bound of a Beta(a, b) posterior
        return a / (a + b) + 1.65 * np.sqrt((a * b) / ((a + b) ** 2 * (a + b + 1)))
    a = self.wins + 1
    b = self.trials - self.wins + 1
    return np.argmax(ub(a, b))


def random_choice(self):
    return np.random.randint(0, len(self.wins))


def ucb_bayes(self):
    # pick the bandit with the largest (1 - 1/(N+1)) posterior quantile
    alpha = 1 - 1. / (self.N + 1)
    return np.argmax(beta.ppf(alpha,
                              1 + self.wins,
                              1 + self.trials - self.wins))


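# A minimal epsilon-greedy sketch for the TODO at the top of this file. This
# function is an assumption, not part of the original code: the name
# epsilon_greedy_choice and the default epsilon=0.1 are illustrative only.
# With probability epsilon it explores a random arm; otherwise it exploits the
# arm with the best observed win rate.
def epsilon_greedy_choice(self, epsilon=0.1):
    if rand() < epsilon:
        # explore: pick an arm uniformly at random
        return np.random.randint(0, len(self.wins))
    # exploit: pick the arm with the highest empirical win rate
    # (+1 in the denominator avoids division by zero for untried arms)
    return np.argmax(self.wins / (self.trials + 1))

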
class Bandits(object):
    """
    This class represents N bandit machines.

    parameters:
        p_array: a (n,) Numpy array of probabilities >0, <1.

    methods:
        pull( i ): return the results, 0 or 1, of pulling
                   the ith bandit.
    """
    def __init__(self, p_array):
        self.p = p_array
        self.optimal = np.argmax(p_array)

    def pull(self, i):
        # i is which arm to pull; returns True (win) or False (loss)
        return rand() < self.p[i]

    def __len__(self):
        return len(self.p)
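

# Minimal usage sketch (illustrative only; the probabilities and the pull count
# below are arbitrary, not part of the original file): run the Bayesian bandit
# strategy against three simulated machines and inspect the totals.
if __name__ == "__main__":
    hidden_probabilities = np.array([0.15, 0.2, 0.3])
    bandits = Bandits(hidden_probabilities)
    strategy = GeneralBanditStrat(bandits, bayesian_bandit_choice)
    strategy.sample_bandits(1000)
    print("trials per bandit:", strategy.trials)
    print("wins per bandit:  ", strategy.wins)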