import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import random
sns.set_style('darkgrid')
%matplotlib inline
gamma = 1
reward_size = -1
grid_size = 4
termination_states = [[0, 0], [grid_size-1, grid_size-1]]
actions = [[-1, 0], [1, 0], [0, 1], [0, -1]]
num_iterations = 1000
def action_reward_function(initial_position, action):
    # Terminal states are absorbing: no move, no reward.
    if initial_position in termination_states:
        return initial_position, 0
    reward = reward_size
    final_position = np.array(initial_position) + np.array(action)
    # Moves that would leave the grid keep the agent in place.
    if (-1 in final_position) or (grid_size in final_position):
        final_position = initial_position
    return final_position, reward
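# Quick illustrative check (not part of the original notebook): stepping left
# from (0, 1) reaches the terminal corner (0, 0) and costs -1.
print(action_reward_function([0, 1], [0, -1]))   # -> (array([0, 0]), -1)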
value_map = np.zeros((grid_size, grid_size))
states = [[i, j] for i in range(grid_size) for j in range(grid_size)]
# Step=0: value_map
value_map
deltas = []
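# The sweep below is iterative policy evaluation for the equiprobable random
# policy pi(a|s) = 1/4. Each state is backed up with the Bellman expectation
# update (a short reminder, added for clarity):
#     V_{k+1}(s) = sum_a pi(a|s) * (R + gamma * V_k(s'))
# where s' is the state reached from s by action a and R = reward_size = -1.
# Terminal states keep V = 0.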
for iter in range(num_iterations):
    copy_value_map = value_map.copy()
    delta_state = []
    for state in states:
        weighted_rewards = 0
        for action in actions:
            final_position, reward = action_reward_function(state, action)
            # Equiprobable random policy: each action has probability 1/len(actions).
            weighted_rewards += (1/len(actions)) * (reward + gamma*value_map[final_position[0], final_position[1]])
        delta_state.append(np.abs(copy_value_map[state[0], state[1]] - weighted_rewards))
        copy_value_map[state[0], state[1]] = weighted_rewards
    deltas.append(delta_state)
    value_map = copy_value_map
    if iter in [0, 1, 2, 9, 99, num_iterations-1]:
        print('Iteration: {}'.format(iter+1))
        print(value_map)
print('deltas shape:', np.shape(deltas))
plt.figure(figsize=(20,10))
plt.plot(deltas);
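# Optional visual check (an addition, not in the original): seaborn is already
# imported, so the converged value function can be shown as an annotated heatmap.
plt.figure(figsize=(6, 5))
sns.heatmap(value_map, annot=True, fmt='.1f', cmap='viridis')
plt.title('State values after iterative policy evaluation (gamma = 1)')
plt.show()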
gamma = 0.6
reward_size = -1
grid_size = 4
termination_states = [[0,0], [grid_size-1, grid_size-1]]
actions = [[-1,0], [1,0], [0,1], [0,-1]]
num_iterations = 1000
V = np.zeros((grid_size, grid_size))
returns = {(i,j): list() for i in range(grid_size) for j in range(grid_size)}
deltas = {(i,j): list() for i in range(grid_size) for j in range(grid_size)}
states = [[i,j] for i in range(grid_size) for j in range(grid_size)]
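# First-visit Monte Carlo prediction (sketch of the idea, added for clarity):
# sample an episode under the random policy, then for the first visit to each
# state s in the episode record the discounted return
#     G_t = R_{t+1} + gamma * G_{t+1}
# and estimate V(s) as the average of all returns recorded for s.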
def generate_episode():
    # Start from a random non-terminal state ([0,0] and [3,3] are excluded).
    init_state = random.choice(states[1:-1])
    episode = []
    while True:
        if list(init_state) in termination_states:
            return episode
        action = random.choice(actions)
        final_state = np.array(init_state) + np.array(action)
        # Moves that would leave the grid keep the agent in place.
        if (-1 in list(final_state)) or (grid_size in list(final_state)):
            final_state = init_state
        episode.append([list(init_state), action, reward_size, list(final_state)])
        init_state = final_state
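# Illustrative check (not in the original): an episode is a list of
# [state, action, reward, next_state] transitions ending at a terminal corner.
sample_episode = generate_episode()
print('sample episode length:', len(sample_episode))
print('first transition:', sample_episode[0])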
for iter in tqdm(range(num_iterations)):
    episode = generate_episode()
    G = 0
    # Walk the episode backwards, accumulating the discounted return G.
    for i, step in enumerate(episode[::-1]):
        G = gamma*G + step[2]
        # First-visit check: only record G if this state does not occur
        # earlier in the episode.
        t = len(episode) - 1 - i
        if step[0] not in [x[0] for x in episode[:t]]:
            idx = (step[0][0], step[0][1])
            returns[idx].append(G)
            new_value = np.average(returns[idx])
            deltas[idx].append(np.abs(V[idx] - new_value))
            V[idx] = new_value
V
# gamma = 1
plt.figure(figsize=(20,10))
all_series = [list(x)[:50] for x in deltas.values()]
for series in all_series:
plt.plot(series)
# gamma = 0.6
plt.figure(figsize=(20,10))
all_series = [list(x)[:50] for x in deltas.values()]
for series in all_series:
plt.plot(series)
gamma = 0.1
reward_size = -1
grid_size = 4
alpha = 0.1 # step_size
termination_states = [[0,0], [grid_size-1, grid_size-1]]
actions = [[-1,0],[1,0],[0,1],[0,-1]]
num_iterations = 10000
V = np.zeros((grid_size, grid_size))
returns = {(i,j):list() for i in range(grid_size) for j in range(grid_size)}
deltas = {(i,j):list() for i in range(grid_size) for j in range(grid_size)}
states = [[i,j] for i in range(grid_size) for j in range(grid_size)]
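# TD(0) prediction under the random policy (reminder added for clarity):
# after each transition (s, a, r, s') the value estimate is nudged toward the
# one-step bootstrapped target,
#     V(s) <- V(s) + alpha * (r + gamma * V(s') - V(s)),
# with step size alpha, without waiting for the end of the episode.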
def generate_initial_state():
    # Start each episode from a random non-terminal state.
    return random.choice(states[1:-1])

def generate_next_action():
    # Behaviour policy: equiprobable random action.
    return random.choice(actions)

def take_action(state, action):
    # Terminal states end the episode: zero reward, no successor.
    if list(state) in termination_states:
        return 0, None
    final_state = np.array(state) + np.array(action)
    # Moves that would leave the grid keep the agent in place.
    if (-1 in list(final_state)) or (grid_size in list(final_state)):
        final_state = state
    return reward_size, list(final_state)
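# Illustrative check (not in the original): a regular move returns the step
# reward and the next state; starting from a terminal state returns (0, None),
# which ends the episode in the loop below.
print(take_action([1, 1], [-1, 0]))   # reward -1, moves to (0, 1)
print(take_action([0, 0], [1, 0]))    # (0, None)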
for iter in tqdm(range(num_iterations)):
    state = generate_initial_state()
    while True:
        action = generate_next_action()
        reward, final_state = take_action(state, action)
        if final_state is None:
            break
        before = V[state[0], state[1]]
        # TD(0) update toward the one-step bootstrapped target.
        V[state[0], state[1]] += alpha*(reward + gamma*V[final_state[0], final_state[1]] - V[state[0], state[1]])
        deltas[state[0], state[1]].append(float(np.abs(before - V[state[0], state[1]])))
        state = final_state
V
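# Note on the numbers above (added remark): with gamma = 0.1 the one-step
# targets are heavily discounted, so the learned values stay close to -1 and
# are not comparable to the gamma = 1 dynamic-programming solution earlier.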
# gamma = 0.5 alpha = 0.5
# Not convergent
plt.figure(figsize=(20,10))
all_series = [list(x)[:50] for x in deltas.values()]
for series in all_series:
plt.plot(series)
# gamma = 0.1 alpha = 0.1
# Convergent
plt.figure(figsize=(20,10))
all_series = [list(x)[:50] for x in deltas.values()]
for series in all_series:
plt.plot(series)