Last Card Game
1. Game rule
- There are 100 cards
- Two players
- player-0 (AI), player-1 (human)
- Turn order: player-0 → player-1 → player-0 → player-1 → …
- On each turn, a player draws 1, 2, or 3 cards
- The player who draws the 100th card (the last card) wins! (a simple optimal strategy exists; see the sketch below)
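Side note: this game has a simple closed-form optimal strategy, which is handy later as a sanity check on the learned policy. A minimal sketch (illustrative only; this helper is not part of the assignment):

# With s cards already drawn out of n, drawing (n - s) % 4 cards (when nonzero)
# leaves the opponent a multiple of 4 remaining cards, which is a losing position.
def optimal_draw(s, n=100):
    r = (n - s) % 4
    return r if r != 0 else 1  # from a multiple of 4 there is no winning move; draw 1 as a fallback

Since 100 is a multiple of 4, the first player actually loses against a perfect opponent; against the random player-1 used below, however, a trained AI can still win almost every game.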
2. Gameplay (example shown with 10 cards for brevity)
- # of drawn cards = 0, Player-0 draws 3 cards
- # of drawn cards = 3, Player-1 draws 1 card
- # of drawn cards = 4, Player-0 draws 1 card
- # of drawn cards = 5, Player-1 draws 2 cards
- # of drawn cards = 7, Player-0 draws 3 cards
- Player-0 wins! He drew card no. 10, the last card
3. The Environment
- 𝑛: total number of cards (=100)
- State: 𝑠 ∈ {0,1,2, … , 𝑛}
- Total number of cards drawn so far
- This equals the number of the most recently drawn card
- Action: 𝑎 ∈ {0,1,2}
- 0: draw 1 card, 1: draw 2 cards, 2: draw 3 cards
- Reward: your choice (one possible scheme is sketched below)
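The reward is left as a design choice. One reasonable scheme, roughly what the code below uses, looks like this (the helper is only illustrative):

# Hypothetical reward helper: +1 for taking the last card, -1 for drawing past it,
# and -1 for every ordinary intermediate move (to encourage finishing quickly).
def sketch_reward(last_drawn_card, n=100):
    if last_drawn_card == n:
        return 1
    elif last_drawn_card > n:
        return -1
    else:
        return -1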
4. Solution
4.1. using Q-Learning
import numpy as np
import matplotlib.pyplot as plt
np.random.seed(300)
ep = 100 # number of epochs
epi = 20 # number of episodes per epoch
score_board_before_train = []
score_board_in_train = []
score_board_after_train = []
class LastCard:
def __init__(self, n=20): # n: number of cards; start with 10 or 20, final results should be for n=100
self.n = n
self.states = [s for s in range(1, self.n+3)]
self.actions = [0, 1, 2] # 0: draw 1 card, 1: draw 2 cards, 2: draw 3 cards
def reset(self):
        self.s = 1 # fixed start: the next card to draw is card no. 1 (could also be chosen randomly)
self.state = self.states[self.s]
return self.s
    def step(self, a):
        action = self.actions[a]
        state = self.state
        if state + action == self.n + 1:  # this draw takes the last card: win
            reward = 1
            done = True
            new_state = 0
            return reward, new_state, done
        elif state + action > self.n + 1:  # drawing past the last card: losing move
            reward = -1
            done = True
            new_state = None
            return reward, new_state, done
        last_card = state
        if action == 0:
            last_card += 1
        elif action == 1:
            last_card += 2
        elif action == 2:
            last_card += 3
        self.state = new_state = last_card
        self.s = s_new = self.states.index(self.state)
        reward = -1  # each intermediate step gets -1
        done = False
        return reward, self.s, done
""" Initialize the environment """
env = LastCard(100)
states = env.states
actions = env.actions
""" Initialize the value and policy """
q = np.zeros(shape=[len(states), len(actions)])
pi = np.ones(shape=[len(states), len(actions)])*(1/3.0)
pi_without_train = np.ones(shape=[len(states), len(actions)])*(1/3.0)
def run_episode(epoch, episode):
s = env.reset()
done = False
S, A, R = [s], [], []
r = 0
while not done:
# player-0 (AI)
a_prob = pi[s, :]
a = np.random.choice([0, 1, 2], p=a_prob) # sample the action
r, s, done = env.step(a) # apply the action
if done:
break
A.append(a) # add to the record
R.append(r) # add to the record
        # player-1 (selects actions at random, no training)
a_prob_without_train = pi_without_train[s, :]
a = np.random.choice([0, 1, 2], p=a_prob_without_train)
_, s, done = env.step(a) # apply the action
if done:
S.append(None)
break
S.append(s) # add to the record
score_board_in_train.append([epoch, episode, 0 if r==1 else 1]) # for visualization
return S, A, R
# after completing the environment class, run gameplay_auto() to verify that it works properly
def gameplay_auto():
s = env.reset()
done = False
while not done:
player = 0
a_prob = pi[s, :]
a = np.random.choice([0, 1, 2], p=a_prob) # sample the action
print(f"player-{player} > current_state: {s:2d}. draw cards (1/2/3): {a+1} {[s+i for i in range(a+1)]}")
r, s, done = env.step(a) # apply the action
if done:
break
player = 1
a_prob_without_train = pi_without_train[s, :]
a = np.random.choice([0, 1, 2], p=a_prob_without_train)
print(f"player-{player} > current_state: {s:2d}. draw cards (1/2/3): {a+1} {[s+i for i in range(a+1)]}")
r, s, done = env.step(a) # apply the action
if done:
break
    if r == 1:  # the player who just moved drew the last card
        print(f"====> player-{player} won!\n")
        return player
    elif r == -1:  # the player who just moved drew past the last card
        print(f"====> player-{1-player} won!\n")
        return 1-player
def gameplay_human():
s = env.reset()
done = False
while not done:
player = 0
a_prob = pi[s, :]
a = np.random.choice([0, 1, 2], p=a_prob) # sample the action
print(f"player-{player} > current_state: {s:2d}. draw cards (1/2/3): {a+1} {[s+i for i in range(a+1)]}")
r, s, done = env.step(a) # apply the action
if done:
break
player = 1
a = int(input("Input draw cards (1/2/3): "))-1
print(f"player-{player} > current_state: {s:2d}. draw cards (1/2/3): {a+1} {[s+i for i in range(a+1)]}")
r, s, done = env.step(a) # apply the action
if done:
break
    if r == 1:  # the player who just moved drew the last card
print(f"====> player-{player} won!")
elif r == -1:
print(f"====> player-{1-player} won!")
gamma = 0.99
"""
m = number of actions
set pi[s,a] = epsilon/m for all a
set pi[s,a] = epsilon/m + (1-epsilon) IF a = argmax_a q(s,a)
"""
def epsilon_greedy(q, epsilon=0.3):
m = len(actions)
pi[:, :] = epsilon/m
for s in range(len(states)):
a_max = np.argmax(q[s, :])
pi[s, a_max] += (1-epsilon)
return pi
"""
q_new = r + gamma*max_a'(q(s',a'))
q(s,a) = q(s,a) + alpha*(q_new - q(s,a))
"""
def q_learning(epoch, alpha=0.1, episodes=20):
for episode in range(episodes):
# gather/sample transitions
S, A, R = run_episode(epoch, episode+1)
for i in range(len(A)):
s = S[i]
a = A[i]
r = R[i]
s_next = S[i+1]
if s_next is None:
q[s, a] = -100000
else:
q_target = q_new = r+gamma*np.max(q[s_next, :])
q[s, a] = q[s, a]+alpha*(q_new-q[s, a])
print(f"epoch: {epoch:4d} -- episode: {episode+1:4d} -- q[{s:2d}, {a}]: {q[s, a]:14.6f}")
dq_min = 0.01
def train():
global pi
for epoch in range(ep):
q_old = q.copy()
q_learning(epoch+1, episodes=epi)
dq = np.mean(np.abs(q_old-q))
if dq<dq_min:
break
pi = epsilon_greedy(q)
def plot_ai_score(score_board, play_mode):
player_0_wins = []
for epoch in range(0, ep*epi, epi):
score_stats_epoch = score_board[epoch:epoch+epi][:, -1]
player_1_win_count = np.count_nonzero(score_stats_epoch)
player_0_win_count = epi-player_1_win_count
player_0_wins.append(player_0_win_count)
plt.figure(figsize=(12, 4))
plt.plot(range(1, ep+1), player_0_wins, 'b-')
plt.title("[Q-Learning] AI Player(player-0)'s Score"+" / "+play_mode)
plt.xlabel("number of epoch")
plt.ylabel("number of wins")
plt.tight_layout()
plt.show()
# game play before train
for epoch in range(ep):
print(f"++++++++++++++++")
print(f"+ epoch : {epoch+1:4d} +")
print(f"++++++++++++++++")
for episode in range(epi):
print(f"======== episode : {episode+1:3d} ========")
score = gameplay_auto()
score_board_before_train.append([epoch+1, episode, score])
print()
print()
# train
train()
pi = epsilon_greedy(q, 0.0)
# game play after train
for epoch in range(ep):
print(f"++++++++++++++++")
print(f"+ epoch : {epoch+1:4d} +")
print(f"++++++++++++++++")
for episode in range(epi):
print(f"======== episode : {episode+1:3d} ========")
score = gameplay_auto()
score_board_after_train.append([epoch+1, episode, score])
print()
print()
# score visualization
plot_ai_score(np.array(score_board_before_train), "Before Training")
plot_ai_score(np.array(score_board_in_train), "In Training")
plot_ai_score(np.array(score_board_after_train), "After Training")
# enjoy game!
gameplay_human()
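To see what the Q-learning agent actually learned, a small inspection loop like the following (illustrative only, not part of the assignment code) prints the greedy action for the first few states; here the state index is the number of the next card to be drawn:

# Inspect the learned greedy policy: how many cards the AI would draw in each state
greedy_draws = np.argmax(q, axis=1) + 1
for s in range(1, min(21, len(states))):
    print(f"next card: {s:3d} -> AI draws {greedy_draws[s]} card(s)")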
4.2. using SARSA
import numpy as np
import matplotlib.pyplot as plt
np.random.seed(300)
ep = 100 # number of epochs
epi = 20 # number of episodes per epoch
score_board_before_train = []
score_board_in_train = []
score_board_after_train = []
class LastCard:
def __init__(self, n=20): # n: number of cards; start with 10 or 20, final results should be for n=100
self.n = n
self.states = [s for s in range(1, self.n+3)]
self.actions = [0, 1, 2] # 0: draw 1 card, 1: draw 2 cards, 2: draw 3 cards
def reset(self):
        self.s = 1 # fixed start: the next card to draw is card no. 1 (could also be chosen randomly)
self.state = self.states[self.s]
return self.s
def step(self, a):
action = self.actions[a]
state = self.state
if state+action == self.n+1: # goal
reward = 1
done = True
new_state = 0
return reward, new_state, done
elif state+action > self.n+1:
reward = -1
done = True
new_state = None
return reward, new_state, done
last_card = state
if action == 0:
last_card += 1
elif action == 1:
last_card += 2
elif action == 2:
last_card += 3
self.state = new_state = last_card
self.s = s_new = self.states.index(self.state)
reward = -1 # each step, get -1
done = False
return reward, self.s, done
""" Initialize the environment """
env = LastCard(100)
states = env.states
actions = env.actions
""" Initialize the value and policy """
q = np.zeros(shape=[len(states), len(actions)])
pi = np.ones(shape=[len(states), len(actions)])*(1/3.0)
pi_without_train = np.ones(shape=[len(states), len(actions)])*(1/3.0)
def run_episode(epoch, episode):
s = env.reset()
done = False
S, A, R = [s], [], []
r = 0
while not done:
# player-0 (AI)
a_prob = pi[s, :]
a = np.random.choice([0, 1, 2], p=a_prob) # sample the action
r, s, done = env.step(a) # apply the action
if done:
break
A.append(a) # add to the record
R.append(r) # add to the record
        # player-1 (selects actions at random, no training)
a_prob_without_train = pi_without_train[s, :]
a = np.random.choice([0, 1, 2], p=a_prob_without_train)
_, s, done = env.step(a) # apply the action
if done:
S.append(None)
break
S.append(s) # add to the record
score_board_in_train.append([epoch, episode, 0 if r==1 else 1]) # for visualization
return S, A, R
# after completing the environment class, run gameplay_auto() to verify that it works properly
def gameplay_auto():
s = env.reset()
done = False
while not done:
player = 0
a_prob = pi[s, :]
a = np.random.choice([0, 1, 2], p=a_prob) # sample the action
print(f"player-{player} > current_state: {s:2d}. draw cards (1/2/3): {a+1} {[s+i for i in range(a+1)]}")
r, s, done = env.step(a) # apply the action
if done:
break
player = 1
a_prob_without_train = pi_without_train[s, :]
a = np.random.choice([0, 1, 2], p=a_prob_without_train)
print(f"player-{player} > current_state: {s:2d}. draw cards (1/2/3): {a+1} {[s+i for i in range(a+1)]}")
r, s, done = env.step(a) # apply the action
if done:
break
    if r == 1:  # the player who just moved drew the last card
        print(f"====> player-{player} won!\n")
        return player
    elif r == -1:  # the player who just moved drew past the last card
        print(f"====> player-{1-player} won!\n")
        return 1-player
def gameplay_human():
s = env.reset()
done = False
while not done:
player = 0
a_prob = pi[s, :]
a = np.random.choice([0, 1, 2], p=a_prob) # sample the action
print(f"player-{player} > current_state: {s:2d}. draw cards (1/2/3): {a+1} {[s+i for i in range(a+1)]}")
r, s, done = env.step(a) # apply the action
if done:
break
player = 1
a = int(input("Input draw cards (1/2/3): "))-1
print(f"player-{player} > current_state: {s:2d}. draw cards (1/2/3): {a+1} {[s+i for i in range(a+1)]}")
r, s, done = env.step(a) # apply the action
if done:
break
    if r == 1:  # the player who just moved drew the last card
print(f"====> player-{player} won!")
elif r == -1:
print(f"====> player-{1-player} won!")
gamma = 0.99
"""
m = number of actions
set pi[s,a] = epsilon/m for all a
set pi[s,a] = epsilon/m + (1-epsilon) IF a = argmax_a q(s,a)
"""
def epsilon_greedy(q, epsilon=0.3):
m = len(actions)
pi[:, :] = epsilon/m
for s in range(len(states)):
a_max = np.argmax(q[s, :])
pi[s, a_max] += (1-epsilon)
return pi
"""
q_new = r + gamma*q(s',a'), where a' is the next action chosen by the current policy (SARSA)
q(s,a) = q(s,a) + alpha*(q_new - q(s,a))
"""
def sarsa(epoch, alpha=0.1, episodes=20):
for episode in range(episodes):
# gather/sample transitions
S, A, R = run_episode(epoch, episode+1)
for i in range(len(A)):
s = S[i]
a = A[i]
r = R[i]
s_next = S[i+1]
            if s_next is None:
                # the AI's move let the opponent take the last card: heavy penalty
                q[s, a] = -100000
            else:
                # on-policy: pick the next action from the current epsilon-greedy policy
                a_next = np.argmax(epsilon_greedy(q)[s_next, :])
                q_target = q_new = r+gamma*q[s_next, a_next]
                q[s, a] = q[s, a]+alpha*(q_new-q[s, a])
print(f"epoch: {epoch:4d} -- episode: {episode+1:4d} -- q[{s:2d}, {a}]: {q[s, a]:14.6f}")
dq_min = 0.01
def train():
global pi
for epoch in range(ep):
q_old = q.copy()
sarsa(epoch+1, episodes=epi)
dq = np.mean(np.abs(q_old-q))
if dq<dq_min:
break
pi = epsilon_greedy(q)
def plot_ai_score(score_board, play_mode):
player_0_wins = []
for epoch in range(0, ep*epi, epi):
score_stats_epoch = score_board[epoch:epoch+epi][:, -1]
player_1_win_count = np.count_nonzero(score_stats_epoch)
player_0_win_count = epi-player_1_win_count
player_0_wins.append(player_0_win_count)
plt.figure(figsize=(12, 4))
plt.plot(range(1, ep+1), player_0_wins, 'b-')
plt.title("[SARSA] AI Player(player-0)'s Score"+" / "+play_mode)
plt.xlabel("number of epoch")
plt.ylabel("number of wins")
plt.tight_layout()
plt.show()
# game play before train
for epoch in range(ep):
print(f"++++++++++++++++")
print(f"+ epoch : {epoch+1:4d} +")
print(f"++++++++++++++++")
for episode in range(epi):
print(f"======== episode : {episode+1:3d} ========")
score = gameplay_auto()
score_board_before_train.append([epoch+1, episode, score])
print()
print()
# train
train()
pi = epsilon_greedy(q, 0.0)
# game play after train
for epoch in range(ep):
print(f"++++++++++++++++")
print(f"+ epoch : {epoch+1:4d} +")
print(f"++++++++++++++++")
for episode in range(epi):
print(f"======== episode : {episode+1:3d} ========")
score = gameplay_auto()
score_board_after_train.append([epoch+1, episode, score])
print()
print()
# score visualization
plot_ai_score(np.array(score_board_before_train), "Before Training")
plot_ai_score(np.array(score_board_in_train), "In Training")
plot_ai_score(np.array(score_board_after_train), "After Training")
# enjoy game!
gameplay_human()
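For reference, the two scripts differ only in how the update target is built:

# Q-learning (off-policy): bootstrap from the best possible next action
#   q_target = r + gamma * np.max(q[s_next, :])
# SARSA (on-policy): bootstrap from the next action a_next chosen by the behavior policy
#   q_target = r + gamma * q[s_next, a_next]

Note that the SARSA script above picks a_next as the argmax of the epsilon-greedy probabilities, which is simply the greedy action, so its target ends up very close to Q-learning's; sampling a_next from the policy instead, e.g. np.random.choice([0, 1, 2], p=epsilon_greedy(q)[s_next, :]), would give textbook SARSA.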