clean up and sync with book from policy iteration to qlearning · rlcode/reinforcement-learning@9a0d98b

Commit 9a0d98b

clean up and sync with book from policy iteration to qlearning
1 parent 454f77c commit 9a0d98b

5 files changed, +21 -31 lines changed

5 files changed

+21
-31
lines changed

1-grid-world/1-policy-iteration/environment.py

Lines changed: 1 addition & 3 deletions
@@ -75,8 +75,6 @@ def _build_canvas(self):
 
         return canvas
 
-    # (rectangle, triangle1, triangle2, circle)
-
     def load_images(self):
         up = PhotoImage(Image.open("../img/up.png").resize((13, 13)))
         right = PhotoImage(Image.open("../img/right.png").resize((13, 13)))
@@ -99,7 +97,7 @@ def reset(self):
         self.agent.value_table = [[0.0] * WIDTH for _ in range(HEIGHT)]
         self.agent.policy_table = ([[[0.25, 0.25, 0.25, 0.25]] * WIDTH
                                     for _ in range(HEIGHT)])
-        self.policy_table[2][2] = []
+        self.agent.policy_table[2][2] = []
         x, y = self.canvas.coords(self.rectangle)
         self.canvas.move(self.rectangle, UNIT / 2 - x, UNIT / 2 - y)
 

1-grid-world/1-policy-iteration/policy_iteration.py

Lines changed: 10 additions & 14 deletions
@@ -2,8 +2,6 @@
 import random
 from environment import GraphicDisplay, Env
 
-DISCOUNT_FACTOR = 0.9
-
 
 class PolicyIteration:
     def __init__(self, env):
@@ -15,6 +13,7 @@ def __init__(self, env):
                               for _ in range(env.height)]
         # setting terminal state
         self.policy_table[2][2] = []
+        self.discount_factor = 0.9
 
     def policy_evaluation(self):
         next_value_table = [[0.00] * self.env.width
@@ -32,8 +31,8 @@ def policy_evaluation(self):
                 next_state = self.env.state_after_action(state, action)
                 reward = self.env.get_reward(state, action)
                 next_value = self.get_value(next_state)
-                value += self.get_policy(state, action) * \
-                         (reward + DISCOUNT_FACTOR * next_value)
+                value += (self.get_policy(state)[action] *
+                          (reward + self.discount_factor * next_value))
 
             next_value_table[state[0]][state[1]] = round(value, 2)
 
@@ -46,15 +45,15 @@ def policy_improvement(self):
                 continue
             value = -99999
             max_index = []
-            result = [0.0, 0.0, 0.0, 0.0] # initialize the policy
+            result = [0.0, 0.0, 0.0, 0.0]  # initialize the policy
 
             # for every actions, calculate
             # [reward + (discount factor) * (next state value function)]
             for index, action in enumerate(self.env.possible_actions):
                 next_state = self.env.state_after_action(state, action)
                 reward = self.env.get_reward(state, action)
                 next_value = self.get_value(next_state)
-                temp = reward + DISCOUNT_FACTOR * next_value
+                temp = reward + self.discount_factor * next_value
 
                 # We normally can't pick multiple actions in greedy policy.
                 # but here we allow multiple actions with same max values
@@ -75,6 +74,7 @@ def policy_improvement(self):
 
         self.policy_table = next_policy
 
+    # get action according to the current policy
     def get_action(self, state):
         random_pick = random.randrange(100) / 100
 
@@ -84,21 +84,17 @@ def get_action(self, state):
         for index, value in enumerate(policy):
             policy_sum += value
             if random_pick < policy_sum:
-                return self.env.possible_actions[index]
+                return index
 
-    def get_policy(self, state, action=None):
-        # if no action is given, then return the probabilities of all actions
-        if action is None:
-            return self.policy_table[state[0]][state[1]]
+    # get policy of specific state
+    def get_policy(self, state):
         if state == [2, 2]:
             return 0.0
-        action_index = self.env.possible_actions.index(action)
-        return self.policy_table[state[0]][state[1]][action_index]
+        return self.policy_table[state[0]][state[1]]
 
     def get_value(self, state):
         return round(self.value_table[state[0]][state[1]], 2)
 
-
 if __name__ == "__main__":
     env = Env()
     policy_iteration = PolicyIteration(env)
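
The policy_evaluation change indexes the action probability as get_policy(state)[action] and uses the instance-level self.discount_factor; the backup it computes is the Bellman expectation update, v(s) <- sum_a pi(a|s) * (r + gamma * v(s')). A minimal self-contained sketch of that backup follows; the (reward, next-state value) pairs are made up for illustration, whereas the real code reads them from Env.get_reward() and the current value_table:

# Sketch of one Bellman expectation backup, as performed per state in
# policy_evaluation. The transition numbers are invented for illustration.
discount_factor = 0.9
policy = [0.25, 0.25, 0.25, 0.25]  # get_policy(state) for the uniform initial policy
transitions = [(0.0, 0.4), (0.0, 0.8), (-1.0, 0.0), (1.0, 0.0)]  # (reward, V(next state))

value = 0.0
for action, (reward, next_value) in enumerate(transitions):
    # v(s) <- sum over actions of pi(a|s) * (r + gamma * v(s'))
    value += policy[action] * (reward + discount_factor * next_value)

print(round(value, 2))  # 0.27 for the made-up numbers above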

1-grid-world/2-value-iteration/value_iteration.py

Lines changed: 5 additions & 9 deletions
@@ -1,20 +1,17 @@
 # -*- coding: utf-8 -*-
-import random
 from environment import GraphicDisplay, Env
 
-DISCOUNT_FACTOR = 0.9
-
-
 class ValueIteration:
     def __init__(self, env):
         self.env = env
         # 2-d list for the value function
         self.value_table = [[0.0] * env.width for _ in range(env.height)]
+        self.discount_factor = 0.9
 
     # get next value function table from the current value function table
     def value_iteration(self):
-        next_value_table = [[0.0] * \
-                            self.env.width for _ in range(self.env.height)]
+        next_value_table = [[0.0] * self.env.width for _ in
+                            range(self.env.height)]
         for state in self.env.get_all_states():
             if state == [2, 2]:
                 next_value_table[state[0]][state[1]] = 0.0
@@ -25,14 +22,13 @@ def value_iteration(self):
                 next_state = self.env.state_after_action(state, action)
                 reward = self.env.get_reward(state, action)
                 next_value = self.get_value(next_state)
-                value_list.append((reward + DISCOUNT_FACTOR * next_value))
+                value_list.append((reward + self.discount_factor * next_value))
             # return the maximum value(it is the optimality equation!!)
             next_value_table[state[0]][state[1]] = round(max(value_list), 2)
         self.value_table = next_value_table
 
     # get action according to the current value function table
     def get_action(self, state):
-
         action_list = []
         max_value = -99999
 
@@ -46,7 +42,7 @@ def get_action(self, state):
             next_state = self.env.state_after_action(state, action)
             reward = self.env.get_reward(state, action)
             next_value = self.get_value(next_state)
-            value = (reward + DISCOUNT_FACTOR * next_value)
+            value = (reward + self.discount_factor * next_value)
 
             if value > max_value:
                 action_list.clear()
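
Where policy evaluation averages over the policy, value_iteration applies the Bellman optimality backup: it collects r + gamma * v(s') for every action and keeps only the maximum, which is what the value_list loop above does. A minimal sketch with made-up transition numbers (the real code obtains them from Env.get_reward() and the value table):

# Sketch of one Bellman optimality backup, as performed per state in
# value_iteration. The transition numbers are invented for illustration.
discount_factor = 0.9
transitions = [(0.0, 0.4), (0.0, 0.8), (-1.0, 0.0), (1.0, 0.0)]  # (reward, V(next state))

# v(s) <- max over actions of (r + gamma * v(s'))
value_list = [reward + discount_factor * next_value
              for reward, next_value in transitions]
print(round(max(value_list), 2))  # 1.0: the single best action decides the value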

1-grid-world/4-sarsa/sarsa_agent.py

Lines changed: 1 addition & 1 deletion
@@ -11,7 +11,7 @@ def __init__(self, actions):
         self.actions = actions
         self.learning_rate = 0.01
         self.discount_factor = 0.9
-        self.epsilon = 0.9
+        self.epsilon = 0.1
         self.q_table = defaultdict(lambda: [0.0, 0.0, 0.0, 0.0])
 
     # with sample <s, a, r, s', a'>, learns new q function
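
Lowering epsilon from 0.9 to 0.1 means the SARSA agent now explores on roughly 10% of steps and acts greedily on the rest, instead of the reverse. For context, a generic epsilon-greedy selector over a list of Q-values looks like the sketch below; epsilon_greedy is an illustrative helper, not this file's get_action:

import random

def epsilon_greedy(q_values, epsilon=0.1):
    # With probability epsilon pick a random action (explore),
    # otherwise pick an action with the highest Q-value (exploit).
    if random.random() < epsilon:
        return random.randrange(len(q_values))
    max_q = max(q_values)
    best = [i for i, q in enumerate(q_values) if q == max_q]
    return random.choice(best)  # break ties randomly

print(epsilon_greedy([0.0, 0.3, 0.1, 0.0]))  # usually 1 when epsilon = 0.1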

1-grid-world/5-q-learning/q_learning_agent.py

Lines changed: 4 additions & 4 deletions
@@ -9,15 +9,15 @@ def __init__(self, actions):
         self.actions = actions
         self.learning_rate = 0.01
         self.discount_factor = 0.9
-        self.epsilon = 0.9
+        self.epsilon = 0.1
         self.q_table = defaultdict(lambda: [0.0, 0.0, 0.0, 0.0])
 
     # update q function with sample <s, a, r, s'>
     def learn(self, state, action, reward, next_state):
-        q_1 = self.q_table[state][action]
+        current_q = self.q_table[state][action]
         # using Bellman Optimality Equation to update q function
-        q_2 = reward + self.discount_factor * max(self.q_table[next_state])
-        self.q_table[state][action] += self.learning_rate * (q_2 - q_1)
+        new_q = reward + self.discount_factor * max(self.q_table[next_state])
+        self.q_table[state][action] += self.learning_rate * (current_q - new_q)
 
     # get action for the state according to the q function table
     # agent pick action of epsilon-greedy policy
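
Note the order of the error term in learn(): the removed line used (q_2 - q_1), i.e. TD target minus current estimate, while the added line uses (current_q - new_q), which flips the sign of the update. The conventional tabular Q-learning update moves Q(s, a) toward the target, as in the generic sketch below (alpha and gamma stand in for learning_rate and discount_factor; this is the textbook form, not this file's code):

# Conventional tabular Q-learning update, for comparison with learn() above.
def q_learning_update(q_table, state, action, reward, next_state,
                      alpha=0.01, gamma=0.9):
    current_q = q_table[state][action]
    # TD target from the Bellman optimality equation
    new_q = reward + gamma * max(q_table[next_state])
    # move the estimate toward the target: note the (new_q - current_q) order
    q_table[state][action] += alpha * (new_q - current_q)

q = {(0, 0): [0.0, 0.0, 0.0, 0.0], (0, 1): [0.0, 0.0, 0.0, 0.0]}
q_learning_update(q, (0, 0), 1, 1.0, (0, 1))
print(q[(0, 0)][1])  # 0.01 = alpha * (1.0 + gamma * 0 - 0)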

0 commit comments
