|
|
|
|
|
|
\(\forall s \in S,\ \forall a \in A:\quad \sum_{s' \in S} \Pr(s' \mid s, a) = 1\)
|
|
\(V = r_0 + \gamma r_1 + \gamma^2 r_2 + \cdots = \sum_{i=0}^{\infty} \gamma^i r_i\)
\(Q^\pi(s, a) = \sum_{s'} \Pr(s' \mid s, a)\,\bigl(R(s, a, s') + \gamma V^\pi(s')\bigr)\)
\(V^\pi(s) = Q^\pi(s, \pi(s))\)
\(Q^*(s, a) = \sum_{s'} \Pr(s' \mid s, a)\,\bigl(R(s, a, s') + \gamma V^*(s')\bigr)\)
\(V^*(s) = \max_a Q^*(s, a)\)
\(\pi^*(s) = a\) such that \(\forall a' \in A:\ Q^*(s, a) \ge Q^*(s, a')\)
value iteration(\(S, A, \Pr, R, \theta\))
    \(V_0[*] \leftarrow 0\)
    \(k \leftarrow 0\)
    repeat
        \(k \leftarrow k + 1\)
        for each \(s \in S\) do
            \(V_k[s] \leftarrow \max_a \sum_n \Pr(n \mid s, a)\,\bigl(R(s, a, n) + \gamma V_{k-1}[n]\bigr)\)
    until \(\forall s:\ |V_k[s] - V_{k-1}[s]| < \theta\)
    for each \(s \in S\) do
        \(\pi[s] \leftarrow\) an action \(a\) maximizing \(\sum_n \Pr(n \mid s, a)\,\bigl(R(s, a, n) + \gamma V_k[n]\bigr)\)
    return \(\pi, V_k\)
\[
\begin{aligned}
0 &= c_{1,0} + c_{1,1} V^\pi(s_1) + \cdots + c_{1,n} V^\pi(s_n) \\
  &\ \,\vdots \\
0 &= c_{n,0} + c_{n,1} V^\pi(s_1) + \cdots + c_{n,n} V^\pi(s_n)
\end{aligned}
\]
policy iteration(\(S, A, \Pr, R\))
    \(\pi[*] \leftarrow\) an arbitrary action (any initial policy)
    repeat
        unchanged \(\leftarrow\) true
        \(V \leftarrow\) the solution to the linear equations given by \(\pi\)
        for each \(s \in S\) do
            \(Q_{best} \leftarrow V[s]\)
            for each \(a \in A\) do
                \(Q_{sa} \leftarrow \sum_n \Pr(n \mid s, a)\,\bigl(R(s, a, n) + \gamma V[n]\bigr)\)
                if \(Q_{sa} > Q_{best}\)
                    \(\pi[s] \leftarrow a\)
                    \(Q_{best} \leftarrow Q_{sa}\)
                    unchanged \(\leftarrow\) false
    until unchanged
    return \(\pi\)