
Last Card Game


1. Game rule

  • There are 100 cards
  • Two players
    • player-0 (AI), player-1 (human)
    • Player-turn sequence: player0-player1-player0-player1- ….
  • On each turn, a player draws 1, 2, or 3 cards
  • The player who draws the 100th card (the last card) wins!
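
A useful reference point when judging the learned policies later: this game has a simple closed-form strategy. Whoever ends a turn with the total number of drawn cards at a multiple of 4 can always take the last card. The backward-induction sketch below is my own addition (the helper names winning_position and optimal_draw are not part of the assignment); it just confirms that the losing positions are exactly the multiples of 4. Since 100 is itself a multiple of 4, the first mover (player-0) cannot force a win against perfect play, but player-1 below acts randomly, so a trained player-0 can still win most games.

from functools import lru_cache

@lru_cache(maxsize=None)
def winning_position(remaining):
    # True if the player to move can force a win with `remaining` cards left
    if 1 <= remaining <= 3:
        return True  # draw them all and take the last card
    # winning iff some draw leaves the opponent in a losing position
    return any(not winning_position(remaining - d) for d in (1, 2, 3))

def optimal_draw(remaining):
    # a draw (1/2/3) that wins now or leaves the opponent losing; None if hopeless
    for d in (1, 2, 3):
        if remaining - d == 0 or (remaining - d > 0 and not winning_position(remaining - d)):
            return d
    return None

print([r for r in range(1, 21) if not winning_position(r)])  # [4, 8, 12, 16, 20]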

 

2. Gameplay (example with a 10-card deck for brevity)

  • # of drawn cards = 0, Player-0 draws 3 cards
  • # of drawn cards = 3, Player-1 draws 1 card
  • # of drawn cards = 4, Player-0 draws 1 card
  • # of drawn cards = 5, Player-1 draws 2 cards
  • # of drawn cards = 7, Player-0 draws 3 cards
  • Player-0 won, having drawn the last (10th) card!

 

3. The Environment

  • 𝑛: total number of cards (=100)
  • State: 𝑠 ∈ {0,1,2, … , 𝑛}
    • Total # of drawn cards
    • i.e., the number of the most recently drawn card
  • Action: 𝑎 ∈ {0,1,2}
    • 0: draw 1 card, 1: draw 2 cards, 2: draw 3 cards
  • Reward: your choice
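
As a quick illustration of the interface above, here is a minimal sketch of the environment (my own, with an illustrative class name LastCardSketch), assuming the reward design that both solutions below end up using: +1 for taking the last card, -1 for drawing past it, and -1 for every ordinary step.

class LastCardSketch:
    def __init__(self, n=100):
        self.n = n                # total number of cards
        self.actions = [0, 1, 2]  # 0/1/2 -> draw 1/2/3 cards

    def reset(self):
        self.s = 0                # state = total number of drawn cards
        return self.s

    def step(self, a):
        drawn = self.s + (a + 1)  # total drawn after this action
        if drawn == self.n:       # took the last (n-th) card -> win
            return +1, drawn, True
        if drawn > self.n:        # drew past the last card -> lose
            return -1, None, True
        self.s = drawn
        return -1, self.s, False  # ordinary step: small negative reward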

 

4. Solution

4.1. using Q-Learning

import numpy as np
import matplotlib.pyplot as plt

np.random.seed(300)

ep = 100 # number of epochs
epi = 20 # number of episodes per epoch

score_board_before_train = []
score_board_in_train = []
score_board_after_train = []


class LastCard:
    def __init__(self, n=20): # n: number of cards; start with 10 or 20, final results should be for n=100
        self.n = n
        self.states = [s for s in range(1, self.n+3)]
        self.actions = [0, 1, 2] # 0: draw 1 card, 1: draw 2 cards, 2: draw 3 cards

    def reset(self):
        self.s = 1 # starting state index (fixed)
        self.state = self.states[self.s]
        return self.s

    def step(self, a):
        action = self.actions[a]
        state = self.state
        if state+action == self.n+1: # goal: this draw takes the last card
            reward = 1
            done = True
            new_state = 0
            return reward, new_state, done
        elif state+action > self.n+1: # over-draw: went past the last card
            reward = -1
            done = True
            new_state = None
            return reward, new_state, done

        last_card = state
        if action == 0:
            last_card += 1
        elif action == 1:
            last_card += 2
        elif action == 2:
            last_card += 3
        self.state = new_state = last_card
        self.s = s_new = self.states.index(self.state)
        reward = -1 # each intermediate step gets -1
        done = False
        return reward, self.s, done


""" Initialize the environment """
env = LastCard(100)
states = env.states
actions = env.actions

""" Initialize the value and policy """
q = np.zeros(shape=[len(states), len(actions)])
pi = np.ones(shape=[len(states), len(actions)])*(1/3.0)
pi_without_train = np.ones(shape=[len(states), len(actions)])*(1/3.0)


def run_episode(epoch, episode):
    s = env.reset()
    done = False
    S, A, R = [s], [], []
    r = 0
    while not done:
        # player-0 (AI)
        a_prob = pi[s, :]
        a = np.random.choice([0, 1, 2], p=a_prob) # sample the action
        r, s, done = env.step(a) # apply the action
        if done:
            break
        A.append(a) # add to the record
        R.append(r) # add to the record

        # player-1 (random select action, without train)
        a_prob_without_train = pi_without_train[s, :]
        a = np.random.choice([0, 1, 2], p=a_prob_without_train)
        _, s, done = env.step(a) # apply the action
        if done:
            S.append(None)
            break
        S.append(s) # add to the record
    score_board_in_train.append([epoch, episode, 0 if r==1 else 1]) # for visualization
    return S, A, R


# after completing the environment class, run gameplay_auto() to verify that it works properly
def gameplay_auto():
    s = env.reset()
    done = False
    while not done:
        player = 0
        a_prob = pi[s, :]
        a = np.random.choice([0, 1, 2], p=a_prob) # sample the action
        print(f"player-{player} > current_state: {s:2d}. draw cards (1/2/3): {a+1} {[s+i for i in range(a+1)]}")
        r, s, done = env.step(a)  # apply the action
        if done:
            break
        player = 1
        a_prob_without_train = pi_without_train[s, :]
        a = np.random.choice([0, 1, 2], p=a_prob_without_train)
        print(f"player-{player} > current_state: {s:2d}. draw cards (1/2/3): {a+1} {[s+i for i in range(a+1)]}")
        r, s, done = env.step(a) # apply the action
        if done:
            break
    if r == 1: # the player who just moved drew the last card
        print(f"====> player-{player} won!\n")
        return 0
    elif r == -1:
        print(f"====> player-{1-player} won!\n")
        return 1


def gameplay_human():
    s = env.reset()
    done = False
    while not done:
        player = 0
        a_prob = pi[s, :]
        a = np.random.choice([0, 1, 2], p=a_prob) # sample the action
        print(f"player-{player} > current_state: {s:2d}. draw cards (1/2/3): {a+1} {[s+i for i in range(a+1)]}")
        r, s, done = env.step(a)  # apply the action
        if done:
            break
        player = 1
        a = int(input("Input draw cards (1/2/3): "))-1
        print(f"player-{player} > current_state: {s:2d}. draw cards (1/2/3): {a+1} {[s+i for i in range(a+1)]}")
        r, s, done = env.step(a) # apply the action
        if done:
            break
    if r == 1: # the player who just moved drew the last card
        print(f"====> player-{player} won!")
    elif r == -1:
        print(f"====> player-{1-player} won!")

gamma = 0.99

"""
m = number of actions
set pi[s,a] = epsilon/m  for all a
set pi[s,a] = epsilon/m + (1-epsilon)  IF a = argmax_a q(s,a)
"""
def epsilon_greedy(q, epsilon=0.3):
    m = len(actions)
    pi[:, :] = epsilon/m
    for s in range(len(states)):
        a_max = np.argmax(q[s, :])
        pi[s, a_max] += (1-epsilon)
    return pi

"""
q_new = r + gamma*max_a'(q(s',a'))
q(s,a) = q(s,a) + alpha*(q_new - q(s,a))
"""
def q_learning(epoch, alpha=0.1, episodes=20):
    for episode in range(episodes):
        # gather/sample transitions
        S, A, R = run_episode(epoch, episode+1)
        for i in range(len(A)):
            s = S[i]
            a = A[i]
            r = R[i]
            s_next = S[i+1]
            if s_next is None:
                q[s, a] = -100000
            else:
                q_target = q_new = r+gamma*np.max(q[s_next, :])
                q[s, a] = q[s, a]+alpha*(q_new-q[s, a])
            print(f"epoch: {epoch:4d} -- episode: {episode+1:4d} -- q[{s:2d}, {a}]: {q[s, a]:14.6f}")


dq_min = 0.01


def train():
    global pi
    for epoch in range(ep):
        q_old = q.copy()
        q_learning(epoch+1, episodes=epi)
        dq = np.mean(np.abs(q_old-q))
        if dq<dq_min:
            break
        pi = epsilon_greedy(q)


def plot_ai_score(score_board, play_mode):
    player_0_wins = []
    for epoch in range(0, ep*epi, epi):
        score_stats_epoch = score_board[epoch:epoch+epi][:, -1]
        player_1_win_count = np.count_nonzero(score_stats_epoch)
        player_0_win_count = epi-player_1_win_count
        player_0_wins.append(player_0_win_count)
    plt.figure(figsize=(12, 4))
    plt.plot(range(1, ep+1), player_0_wins, 'b-')
    plt.title("[Q-Learning] AI Player(player-0)'s Score"+" / "+play_mode)
    plt.xlabel("number of epoch")
    plt.ylabel("number of wins")
    plt.tight_layout()
    plt.show()


# gameplay before training
for epoch in range(ep):
    print(f"++++++++++++++++")
    print(f"+ epoch : {epoch+1:4d} +")
    print(f"++++++++++++++++")
    for episode in range(epi):
        print(f"======== episode : {episode+1:3d} ========")
        score = gameplay_auto()
        score_board_before_train.append([epoch+1, episode, score])
        print()
    print()

# train
train()
pi = epsilon_greedy(q, 0.0)

# gameplay after training
for epoch in range(ep):
    print(f"++++++++++++++++")
    print(f"+ epoch : {epoch+1:4d} +")
    print(f"++++++++++++++++")
    for episode in range(epi):
        print(f"======== episode : {episode+1:3d} ========")
        score = gameplay_auto()
        score_board_after_train.append([epoch+1, episode, score])
        print()
    print()

# score visualization
plot_ai_score(np.array(score_board_before_train), "Before Training")
plot_ai_score(np.array(score_board_in_train), "In Training")
plot_ai_score(np.array(score_board_after_train), "After Training")

# enjoy game!
gameplay_human()
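
After training, a quick sanity check (my own addition, not part of the original listing) is to print the greedy action for each state and compare it against the multiple-of-4 strategy noted in section 1:

# inspect the learned greedy policy state by state
for i, s in enumerate(states):
    greedy_draw = np.argmax(q[i, :]) + 1  # draw 1, 2 or 3 cards
    print(f"state {s:3d} -> draw {greedy_draw} card(s)")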

4.2. using SARSA

import numpy as np
import matplotlib.pyplot as plt

np.random.seed(300)

ep = 100 # number of epochs
epi = 20 # number of episodes per epoch

score_board_before_train = []
score_board_in_train = []
score_board_after_train = []


class LastCard:
    def __init__(self, n=20): # n: number of cards; start with 10 or 20, final results should be for n=100
        self.n = n
        self.states = [s for s in range(1, self.n+3)]
        self.actions = [0, 1, 2] # 0: draw 1 card, 1: draw 2 cards, 2: draw 3 cards

    def reset(self):
        self.s = 1 # starting state index (fixed)
        self.state = self.states[self.s]
        return self.s

    def step(self, a):
        action = self.actions[a]
        state = self.state
        if state+action == self.n+1: # goal
            reward = 1
            done = True
            new_state = 0
            return reward, new_state, done
        elif state+action > self.n+1:
            reward = -1
            done = True
            new_state = None
            return reward, new_state, done

        last_card = state
        if action == 0:
            last_card += 1
        elif action == 1:
            last_card += 2
        elif action == 2:
            last_card += 3
        self.state = new_state = last_card
        self.s = s_new = self.states.index(self.state)
        reward = -1 # each step, get -1
        done = False
        return reward, self.s, done


""" Initialize the environment """
env = LastCard(100)
states = env.states
actions = env.actions

""" Initialize the value and policy """
q = np.zeros(shape=[len(states), len(actions)])
pi = np.ones(shape=[len(states), len(actions)])*(1/3.0)
pi_without_train = np.ones(shape=[len(states), len(actions)])*(1/3.0)


def run_episode(epoch, episode):
    s = env.reset()
    done = False
    S, A, R = [s], [], []
    r = 0
    while not done:
        # player-0 (AI)
        a_prob = pi[s, :]
        a = np.random.choice([0, 1, 2], p=a_prob) # sample the action
        r, s, done = env.step(a) # apply the action
        if done:
            break
        A.append(a) # add to the record
        R.append(r) # add to the record

        # player-1 (random select action, without train)
        a_prob_without_train = pi_without_train[s, :]
        a = np.random.choice([0, 1, 2], p=a_prob_without_train)
        _, s, done = env.step(a) # apply the action
        if done:
            S.append(None)
            break
        S.append(s) # add to the record
    score_board_in_train.append([epoch, episode, 0 if r==1 else 1]) # for visualization
    return S, A, R


# after completing the environment class, run gameplay_auto() to verify that it works properly
def gameplay_auto():
    s = env.reset()
    done = False
    while not done:
        player = 0
        a_prob = pi[s, :]
        a = np.random.choice([0, 1, 2], p=a_prob) # sample the action
        print(f"player-{player} > current_state: {s:2d}. draw cards (1/2/3): {a+1} {[s+i for i in range(a+1)]}")
        r, s, done = env.step(a)  # apply the action
        if done:
            break
        player = 1
        a_prob_without_train = pi_without_train[s, :]
        a = np.random.choice([0, 1, 2], p=a_prob_without_train)
        print(f"player-{player} > current_state: {s:2d}. draw cards (1/2/3): {a+1} {[s+i for i in range(a+1)]}")
        r, s, done = env.step(a) # apply the action
        if done:
            break
    if r == 1: # the player who just moved drew the last card
        print(f"====> player-{player} won!\n")
        return 0
    elif r == -1:
        print(f"====> player-{1-player} won!\n")
        return 1


def gameplay_human():
    s = env.reset()
    done = False
    while not done:
        player = 0
        a_prob = pi[s, :]
        a = np.random.choice([0, 1, 2], p=a_prob) # sample the action
        print(f"player-{player} > current_state: {s:2d}. draw cards (1/2/3): {a+1} {[s+i for i in range(a+1)]}")
        r, s, done = env.step(a)  # apply the action
        if done:
            break
        player = 1
        a = int(input("Input draw cards (1/2/3): "))-1
        print(f"player-{player} > current_state: {s:2d}. draw cards (1/2/3): {a+1} {[s+i for i in range(a+1)]}")
        r, s, done = env.step(a) # apply the action
        if done:
            break
    if r == 1: # the player who just moved drew the last card
        print(f"====> player-{player} won!")
    elif r == -1:
        print(f"====> player-{1-player} won!")

gamma = 0.99

"""
m = number of actions
set pi[s,a] = epsilon/m  for all a
set pi[s,a] = epsilon/m + (1-epsilon)  IF a = argmax_a q(s,a)
"""
def epsilon_greedy(q, epsilon=0.3):
    m = len(actions)
    pi[:, :] = epsilon/m
    for s in range(len(states)):
        a_max = np.argmax(q[s, :])
        pi[s, a_max] += (1-epsilon)
    return pi

"""
q_new = r + gamma*max_a'(q(s',a'))
q(s,a) = q(s,a) + alpha*(q_new - q(s,a))
"""
def sarsa(epoch, alpha=0.1, episodes=20):
    for episode in range(episodes):
        # gather/sample transitions
        S, A, R = run_episode(epoch, episode+1)
        for i in range(len(A)):
            s = S[i]
            a = A[i]
            r = R[i]
            s_next = S[i+1]
            if s_next is None:
                q[s, a] = -100000
            else:
                # the most probable (greedy) action under the epsilon-greedy policy
                a_next = np.argmax(epsilon_greedy(q)[s_next, :])
                q_target = q_new = r+gamma*q[s_next, a_next]
                q[s, a] = q[s, a]+alpha*(q_new-q[s, a])
            print(f"epoch: {epoch:4d} -- episode: {episode+1:4d} -- q[{s:2d}, {a}]: {q[s, a]:14.6f}")


dq_min = 0.01


def train():
    global pi
    for epoch in range(ep):
        q_old = q.copy()
        sarsa(epoch+1, episodes=epi)
        dq = np.mean(np.abs(q_old-q))
        if dq<dq_min:
            break
        pi = epsilon_greedy(q)


def plot_ai_score(score_board, play_mode):
    player_0_wins = []
    for epoch in range(0, ep*epi, epi):
        score_stats_epoch = score_board[epoch:epoch+epi][:, -1]
        player_1_win_count = np.count_nonzero(score_stats_epoch)
        player_0_win_count = epi-player_1_win_count
        player_0_wins.append(player_0_win_count)
    plt.figure(figsize=(12, 4))
    plt.plot(range(1, ep+1), player_0_wins, 'b-')
    plt.title("[SARSA] AI Player(player-0)'s Score"+" / "+play_mode)
    plt.xlabel("number of epoch")
    plt.ylabel("number of wins")
    plt.tight_layout()
    plt.show()


# gameplay before training
for epoch in range(ep):
    print(f"++++++++++++++++")
    print(f"+ epoch : {epoch+1:4d} +")
    print(f"++++++++++++++++")
    for episode in range(epi):
        print(f"======== episode : {episode+1:3d} ========")
        score = gameplay_auto()
        score_board_before_train.append([epoch+1, episode, score])
        print()
    print()

# train
train()
pi = epsilon_greedy(q, 0.0)

# gameplay after training
for epoch in range(ep):
    print(f"++++++++++++++++")
    print(f"+ epoch : {epoch+1:4d} +")
    print(f"++++++++++++++++")
    for episode in range(epi):
        print(f"======== episode : {episode+1:3d} ========")
        score = gameplay_auto()
        score_board_after_train.append([epoch+1, episode, score])
        print()
    print()

# score visualization
plot_ai_score(np.array(score_board_before_train), "Before Training")
plot_ai_score(np.array(score_board_in_train), "In Training")
plot_ai_score(np.array(score_board_after_train), "After Training")

# enjoy game!
gameplay_human()