Markov Decision Process Example
MDP example¶
State Value Function Changes in Policy Iterations¶
Import Library¶
In [1]:
import numpy as np
Grid World¶
In [2]:
BOARD_ROWS = 3  # grid world height (rows)
BOARD_COLS = 3  # grid world width (columns)
GAMMA = 1.0
POSSIBLE_ACTIONS = [0, 1, 2, 3]  # left, right, up, down
ACTIONS = [(-1, 0), (1, 0), (0, -1), (0, 1)]  # the same actions as coordinate offsets
REWARDS = []
Environment¶
In [3]:
class Env:
    def __init__(self):
        self.height = BOARD_ROWS
        self.width = BOARD_COLS
        self.reward = [[0] * BOARD_ROWS for _ in range(BOARD_COLS)]
        self.reward[2][2] = 1.0   # reward +1.0 at the goal (2, 2)
        self.reward[0][2] = -0.1  # reward -0.1 at the trap (0, 2)
        self.reward[2][0] = -0.1  # reward -0.1 at the trap (2, 0)
        self.possible_actions = POSSIBLE_ACTIONS
        self.all_state = []
        for y in range(BOARD_ROWS):
            for x in range(BOARD_COLS):
                state = [x, y]
                self.all_state.append(state)

    def check_boundary(self, state):  # clamp a state so it stays inside the grid
        state[0] = (0 if state[0] < 0 else BOARD_COLS-1 if state[0] > BOARD_COLS-1 else state[0])
        state[1] = (0 if state[1] < 0 else BOARD_ROWS-1 if state[1] > BOARD_ROWS-1 else state[1])
        return state

    def state_after_action(self, state, action_index):  # next state after taking an action
        action = ACTIONS[action_index]
        return self.check_boundary([state[0]+action[0], state[1]+action[1]])

    def get_reward(self, state, action):  # reward received for moving to the next state
        next_state = self.state_after_action(state, action)
        return self.reward[next_state[0]][next_state[1]]

    def get_all_states(self):  # list of all states in the grid world
        return self.all_state
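As a quick sanity check (not part of the original notebook), the short sketch below assumes the cell above has been run: moving right from [1, 2] lands on the goal cell and returns reward 1.0, while moving left from [0, 0] is clamped to the grid boundary.

env = Env()
# Action 1 is "right": from [1, 2] the agent steps onto the goal cell [2, 2].
print(env.state_after_action([1, 2], 1))  # [2, 2]
print(env.get_reward([1, 2], 1))          # 1.0
# Action 0 is "left": from [0, 0] the move is clamped to the boundary.
print(env.state_after_action([0, 0], 0))  # [0, 0]
print(env.get_reward([0, 0], 0))          # 0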
Policy Iteration¶
In [4]:
class PolicyIteration:
    def __init__(self, env):
        self.env = env  # environment object
        # initialize the value function as a 2-D list
        self.value_table = [[0.0]*env.height for _ in range(env.width)]
        # initial policy: equal probability (0.25) for each of the four actions
        self.policy_table = [[[0.25, 0.25, 0.25, 0.25] for _ in range(env.width)]
                             for _ in range(env.height)]
        self.discount_factor = GAMMA  # discount factor

    def policy_evaluation(self):  # policy evaluation: compute the next value function with the Bellman expectation equation
        next_value_table = [[0.0] * self.env.width for _ in range(self.env.height)]  # initialize the next value function
        for state in self.env.get_all_states():  # apply the Bellman expectation equation to every state
            value = 0.0
            # Bellman expectation equation
            for action in self.env.possible_actions:
                next_state = self.env.state_after_action(state, action)
                reward = self.env.get_reward(state, action)
                next_value = self.get_value(next_state)
                value += self.get_policy(state)[action] * (reward + self.discount_factor*next_value)
            next_value_table[state[0]][state[1]] = value
        self.value_table = next_value_table

    def policy_improvement(self):  # greedy policy improvement with respect to the current value function
        next_policy = self.policy_table
        for state in self.env.get_all_states():
            #if state == [2, 2]:  # terminal state
            #    continue
            value_list = []
            result = [0.0, 0.0, 0.0, 0.0]  # initialize the policy to return
            # compute [reward + (discount factor * next state's value)] for every action
            for index, action in enumerate(self.env.possible_actions):
                next_state = self.env.state_after_action(state, action)
                reward = self.env.get_reward(state, action)
                next_value = self.get_value(next_state)
                value = reward + self.discount_factor*next_value
                value_list.append(value)
            # greedy improvement: give probability only to the actions with the maximum return
            max_idx_list = np.argwhere(value_list == np.amax(value_list))
            max_idx_list = max_idx_list.flatten().tolist()
            prob = 1.0/len(max_idx_list)
            for idx in max_idx_list:
                result[idx] = prob
            next_policy[state[0]][state[1]] = result
        self.policy_table = next_policy

    def get_action(self, state):  # sample an action at a given state according to the policy
        policy = np.array(self.get_policy(state))
        return np.random.choice(4, 1, p=policy)[0]

    def get_policy(self, state):  # return the policy for a state
        return self.policy_table[state[0]][state[1]]

    def get_value(self, state):  # return the value of a state
        return self.value_table[state[0]][state[1]]
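For reference, each call to policy_evaluation performs one synchronous sweep of the Bellman expectation equation below, where $s'$ is the state returned by state_after_action(s, a) and $\gamma$ is GAMMA; policy_improvement then acts greedily on the resulting values, splitting the probability evenly among tied maximizing actions:

$$v_{k+1}(s) \;=\; \sum_{a} \pi(a \mid s)\,\bigl[r(s,a) + \gamma\, v_k(s')\bigr],
\qquad
A^{*}(s) \;=\; \operatorname*{argmax}_{a}\,\bigl[r(s,a) + \gamma\, v_k(s')\bigr]$$

Each action in $A^{*}(s)$ receives probability $1/|A^{*}(s)|$ and every other action receives 0.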
Show Value Table¶
In [5]:
def showValues(value_table):
    for i in range(0, len(value_table)):
        print('----------------------------')
        out = '| '
        for j in range(0, len(value_table[0])):
            out += str(round(value_table[i][j], 3)).ljust(6) + ' | '
        print(out)
    print('----------------------------')
Run Iteration¶
In [6]:
env = Env()
policy_iteration = PolicyIteration(env)
for k in range(8):
    print("k =", k+1)
    policy_iteration.policy_evaluation()
    showValues(policy_iteration.value_table)
    print()
k = 1
----------------------------
| 0.0    | -0.025 | -0.05  | 
----------------------------
| -0.025 | 0.0    | 0.225  | 
----------------------------
| -0.05  | 0.225  | 0.5    | 
----------------------------

k = 2
----------------------------
| -0.013 | -0.044 | -0.025 | 
----------------------------
| -0.044 | 0.1    | 0.394  | 
----------------------------
| -0.025 | 0.394  | 0.863  | 
----------------------------

k = 3
----------------------------
| -0.028 | -0.02  | 0.025  | 
----------------------------
| -0.02  | 0.175  | 0.558  | 
----------------------------
| 0.025  | 0.558  | 1.128  | 
----------------------------

k = 4
----------------------------
| -0.024 | 0.013  | 0.097  | 
----------------------------
| 0.013  | 0.269  | 0.696  | 
----------------------------
| 0.097  | 0.696  | 1.343  | 
----------------------------

k = 5
----------------------------
| -0.006 | 0.064  | 0.176  | 
----------------------------
| 0.064  | 0.355  | 0.826  | 
----------------------------
| 0.176  | 0.826  | 1.52   | 
----------------------------

k = 6
----------------------------
| 0.029  | 0.122  | 0.26   | 
----------------------------
| 0.122  | 0.445  | 0.944  | 
----------------------------
| 0.26   | 0.944  | 1.673  | 
----------------------------

k = 7
----------------------------
| 0.076  | 0.189  | 0.347  | 
----------------------------
| 0.189  | 0.533  | 1.056  | 
----------------------------
| 0.347  | 1.056  | 1.809  | 
----------------------------

k = 8
----------------------------
| 0.132  | 0.261  | 0.435  | 
----------------------------
| 0.261  | 0.622  | 1.161  | 
----------------------------
| 0.435  | 1.161  | 1.932  | 
----------------------------
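The loop above runs only policy_evaluation, so the table shows how the state values evolve under the fixed uniform random policy. As a follow-up, a minimal sketch of full policy iteration with the same class, alternating evaluation and greedy improvement and then querying the learned policy, might look like this (hypothetical usage, assuming the cells above have been executed):

# Hypothetical follow-up, not part of the original run: full policy iteration.
env = Env()
agent = PolicyIteration(env)
for _ in range(8):
    agent.policy_evaluation()   # evaluate the current policy (one Bellman sweep)
    agent.policy_improvement()  # improve it greedily w.r.t. the new values
print(agent.get_policy([0, 0]))  # action probabilities at state [0, 0]
print(agent.get_action([0, 0]))  # one action sampled from that policy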