PROGRAM-10
Implement a Q-learning algorithm to train an agent to navigate a simple grid environment, defining the reward structure and analyzing the agent's performance.
import numpy as np
# Define the environment
n_states = 16 # Number of states in the grid world
n_actions = 4 # Number of possible actions (up, down, left, right)
goal_state = 15 # Goal state
# Initialize Q-table with zeros
Q_table = np.zeros((n_states, n_actions))
# Define hyperparameters
learning_rate = 0.8      # alpha: step size of the Q-value update
discount_factor = 0.95   # gamma: weight given to future rewards
exploration_prob = 0.2   # epsilon: probability of taking a random action
epochs = 1000            # number of training episodes
# Q-learning algorithm
for epoch in range(epochs):
    current_state = np.random.randint(0, n_states)  # Start from a random state
    while current_state != goal_state:
        # Choose an action with the epsilon-greedy strategy
        if np.random.rand() < exploration_prob:
            action = np.random.randint(0, n_actions)      # Explore
        else:
            action = np.argmax(Q_table[current_state])    # Exploit
        # Simulate the environment: for simplicity the transition is deterministic
        # and always moves to the next state in sequence, regardless of the action
        next_state = (current_state + 1) % n_states
        # Simple reward function: 1 if the goal state is reached, 0 otherwise
        reward = 1 if next_state == goal_state else 0
        # Update the Q-value using the Q-learning update rule
        Q_table[current_state, action] += learning_rate * \
            (reward + discount_factor * np.max(Q_table[next_state])
             - Q_table[current_state, action])
        current_state = next_state  # Move to the next state
# After training, the Q-table holds the learned Q-values
print("Learned Q-table:")
print(Q_table)
Output:

Learned Q-table:
[[0.48767498 0.48751892 0.48751892 0.46816798]
[0.51334208 0.51330923 0.51334207 0.50923535]
[0.54036009 0.5403255 0.54036003 0.5403587 ]
[0.56880009 0.56880009 0.56880008 0.56880009]
[0.59873694 0.59873694 0.59873694 0.59873694]
[0.63024941 0.63024941 0.63024941 0.63024941]
[0.66342043 0.66342043 0.66342043 0.66342043]
[0.6983373 0.6983373 0.6983373 0.6983373 ]
[0.73509189 0.73509189 0.73509189 0.73509189]
[0.77378094 0.77378094 0.77378094 0.77378094]
[0.81450625 0.81450625 0.81450625 0.81450625]
[0.857375 0.857375 0.857375 0.857375 ]
[0.9025 0.9025 0.9025 0.9025 ]
[0.95 0.95 0.95 0.95 ]
[1. 1. 1. 1. ]
[0. 0. 0. 0. ]]
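The problem statement also asks for an analysis of agent performance, which the listing above does not carry out. Below is a minimal sketch of such an analysis, assuming the variables from the listing (Q_table, goal_state, n_states, discount_factor) are still in scope; the names greedy_policy and run_episode are illustrative helpers, not part of the original program. Because the simplified transition always moves to (current_state + 1) % n_states regardless of the chosen action, the optimal Q-value of every action in state s is discount_factor ** (14 - s), which is exactly the geometric pattern visible in the printed table (1.0 for state 14, 0.95 for state 13, 0.9025 for state 12, down to about 0.4877 for state 0; the row for state 15 stays at zero because no update is ever made from the goal state).

# Extract the greedy policy: the highest-valued action in each state
# (illustrative helper, not part of the original listing)
greedy_policy = np.argmax(Q_table, axis=1)
print("Greedy action per state:", greedy_policy)

def run_episode(start_state, max_steps=100):
    """Follow the greedy policy from start_state and count steps to the goal."""
    state, steps = start_state, 0
    while state != goal_state and steps < max_steps:
        action = greedy_policy[state]        # exploit only, no exploration
        state = (state + 1) % n_states       # same simplified transition as in training
        steps += 1
    return steps

# Average number of steps to reach the goal from every non-goal start state
steps_per_start = [run_episode(s) for s in range(n_states) if s != goal_state]
print("Average steps to goal:", np.mean(steps_per_start))

# Compare the learned values with the theoretical optimum gamma^(14 - s)
theoretical = np.array([discount_factor ** (goal_state - 1 - s) for s in range(goal_state)])
print("Max absolute error vs. theory:",
      np.abs(Q_table[:goal_state].max(axis=1) - theoretical).max())

With this deterministic transition every episode from state s takes exactly 15 - s steps, so the average over the 15 non-goal start states is 8; the comparison against the theoretical optimum indicates how close the learned values are after 1000 training episodes.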