
Last Card Game


1. Game rule

  • There are 100 cards
  • Two players
    • player-0 (AI), player-1 (human)
    • Player-turn sequence: player0-player1-player0-player1- ….
  • On each turn, a player draws 1, 2, or 3 cards
  • The player who draws the 100th card (the last card) wins!
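
A useful reference point when judging the learned policies later: this game has a simple closed-form strategy. Whoever ends a turn with the total number of drawn cards at a multiple of 4 can always take the last card. The backward-induction sketch below is my own addition (the helper names winning_position and optimal_draw are not part of the assignment); it just confirms that the losing positions are exactly the multiples of 4. Since 100 is itself a multiple of 4, the first mover (player-0) cannot force a win against perfect play, but player-1 below acts randomly, so a trained player-0 can still win most games.

from functools import lru_cache

@lru_cache(maxsize=None)
def winning_position(remaining):
    # True if the player to move can force a win with `remaining` cards left
    if 1 <= remaining <= 3:
        return True  # draw them all and take the last card
    # winning iff some draw leaves the opponent in a losing position
    return any(not winning_position(remaining - d) for d in (1, 2, 3))

def optimal_draw(remaining):
    # a draw (1/2/3) that wins now or leaves the opponent losing; None if hopeless
    for d in (1, 2, 3):
        if remaining - d == 0 or (remaining - d > 0 and not winning_position(remaining - d)):
            return d
    return None

print([r for r in range(1, 21) if not winning_position(r)])  # [4, 8, 12, 16, 20]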

 

2. Gameplay (example with a 10-card deck for brevity)

  • # of drawn cards = 0, Player-0 draws 3 cards
  • # of drawn cards = 3, Player-1 draws 1 card
  • # of drawn cards = 4, Player-0 draws 1 card
  • # of drawn cards = 5, Player-1 draws 2 cards
  • # of drawn cards = 7, Player-0 draws 3 cards
  • Player-0 won, having drawn the last (10th) card!

 

3. The Environment

  • 𝑛: total number of cards (=100)
  • State: 𝑠 ∈ {0,1,2, … , 𝑛}
    • Total # of drawn cards
    • i.e., the number of the most recently drawn card
  • Action: 𝑎 ∈ {0,1,2}
    • 0: draw 1 card, 1: draw 2 cards, 2: draw 3 cards
  • Reward: your choice
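
As a quick illustration of the interface above, here is a minimal sketch of the environment (my own, with an illustrative class name LastCardSketch), assuming the reward design that both solutions below end up using: +1 for taking the last card, -1 for drawing past it, and -1 for every ordinary step.

class LastCardSketch:
    def __init__(self, n=100):
        self.n = n                # total number of cards
        self.actions = [0, 1, 2]  # 0/1/2 -> draw 1/2/3 cards

    def reset(self):
        self.s = 0                # state = total number of drawn cards
        return self.s

    def step(self, a):
        drawn = self.s + (a + 1)  # total drawn after this action
        if drawn == self.n:       # took the last (n-th) card -> win
            return +1, drawn, True
        if drawn > self.n:        # drew past the last card -> lose
            return -1, None, True
        self.s = drawn
        return -1, self.s, False  # ordinary step: small negative reward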

 

4. Solution

4.1. using Q-Learning

import numpy as np
import matplotlib.pyplot as plt

np.random.seed(300)

ep = 100 # number of epochs
epi = 20 # number of episodes per epoch

score_board_before_train = []
score_board_in_train = []
score_board_after_train = []


class LastCard:
    def __init__(self, n=20): # n: number of cards; start with 10 or 20, final results should be for n=100
        self.n = n
        self.states = [s for s in range(1, self.n+3)]
        self.actions = [0, 1, 2] # 0: draw 1 card, 1: draw 2 cards, 2: draw 3 cards

    def reset(self):
        self.s = 1 # starting state index (fixed)
        self.state = self.states[self.s]
        return self.s

    def step(self, a):
        action = self.actions[a]
        state = self.state
        if state+action == self.n+1: # goal: this draw takes the last card
            reward = 1
            done = True
            new_state = 0
            return reward, new_state, done
        elif state+action > self.n+1: # over-draw: went past the last card
            reward = -1
            done = True
            new_state = None
            return reward, new_state, done

        last_card = state
        if action == 0:
            last_card += 1
        elif action == 1:
            last_card += 2
        elif action == 2:
            last_card += 3
        self.state = new_state = last_card
        self.s = s_new = self.states.index(self.state)
        reward = -1 # each intermediate step gets -1
        done = False
        return reward, self.s, done


""" Initialize the environment """
env = LastCard(100)
states = env.states
actions = env.actions

""" Initialize the value and policy """
q = np.zeros(shape=[len(states), len(actions)])
pi = np.ones(shape=[len(states), len(actions)])*(1/3.0)
pi_without_train = np.ones(shape=[len(states), len(actions)])*(1/3.0)


def run_episode(epoch, episode):
    s = env.reset()
    done = False
    S, A, R = [s], [], []
    r = 0
    while not done:
        # player-0 (AI)
        a_prob = pi[s, :]
        a = np.random.choice([0, 1, 2], p=a_prob) # sample the action
        r, s, done = env.step(a) # apply the action
        if done:
            break
        A.append(a) # add to the record
        R.append(r) # add to the record

        # player-1 (random select action, without train)
        a_prob_without_train = pi_without_train[s, :]
        a = np.random.choice([0, 1, 2], p=a_prob_without_train)
        _, s, done = env.step(a) # apply the action
        if done:
            S.append(None)
            break
        S.append(s) # add to the record
    score_board_in_train.append([epoch, episode, 0 if r==1 else 1]) # for visualization
    return S, A, R


# after completing the environment class, run gameplay_auto() to verify that it works properly
def gameplay_auto():
    s = env.reset()
    done = False
    while not done:
        player = 0
        a_prob = pi[s, :]
        a = np.random.choice([0, 1, 2], p=a_prob) # sample the action
        print(f"player-{player} > current_state: {s:2d}. draw cards (1/2/3): {a+1} {[s+i for i in range(a+1)]}")
        r, s, done = env.step(a)  # apply the action
        if done:
            break
        player = 1
        a_prob_without_train = pi_without_train[s, :]
        a = np.random.choice([0, 1, 2], p=a_prob_without_train)
        print(f"player-{player} > current_state: {s:2d}. draw cards (1/2/3): {a+1} {[s+i for i in range(a+1)]}")
        r, s, done = env.step(a) # apply the action
        if done:
            break
    if r == 1: # the player who just moved drew the last card
        print(f"====> player-{player} won!\n")
        return 0
    elif r == -1:
        print(f"====> player-{1-player} won!\n")
        return 1


def gameplay_human():
    s = env.reset()
    done = False
    while not done:
        player = 0
        a_prob = pi[s, :]
        a = np.random.choice([0, 1, 2], p=a_prob) # sample the action
        print(f"player-{player} > current_state: {s:2d}. draw cards (1/2/3): {a+1} {[s+i for i in range(a+1)]}")
        r, s, done = env.step(a)  # apply the action
        if done:
            break
        player = 1
        a = int(input("Input draw cards (1/2/3): "))-1
        print(f"player-{player} > current_state: {s:2d}. draw cards (1/2/3): {a+1} {[s+i for i in range(a+1)]}")
        r, s, done = env.step(a) # apply the action
        if done:
            break
    if r == 1: # the player who just moved drew the last card
        print(f"====> player-{player} won!")
    elif r == -1:
        print(f"====> player-{1-player} won!")

gamma = 0.99

"""
m = number of actions
set pi[s,a] = epsilon/m  for all a
set pi[s,a] = epsilon/m + (1-epsilon)  IF a = argmax_a q(s,a)
"""
def epsilon_greedy(q, epsilon=0.3):
    m = len(actions)
    pi[:, :] = epsilon/m
    for s in range(len(states)):
        a_max = np.argmax(q[s, :])
        pi[s, a_max] += (1-epsilon)
    return pi

"""
q_new = r + gamma*max_a'(q(s',a'))
q(s,a) = q(s,a) + alpha*(q_new - q(s,a))
"""
def q_learning(epoch, alpha=0.1, episodes=20):
    for episode in range(episodes):
        # gather/sample transitions
        S, A, R = run_episode(epoch, episode+1)
        for i in range(len(A)):
            s = S[i]
            a = A[i]
            r = R[i]
            s_next = S[i+1]
            if s_next is None:
                q[s, a] = -100000
            else:
                q_target = q_new = r+gamma*np.max(q[s_next, :])
                q[s, a] = q[s, a]+alpha*(q_new-q[s, a])
            print(f"epoch: {epoch:4d} -- episode: {episode+1:4d} -- q[{s:2d}, {a}]: {q[s, a]:14.6f}")


dq_min = 0.01


def train():
    global pi
    for epoch in range(ep):
        q_old = q.copy()
        q_learning(epoch+1, episodes=epi)
        dq = np.mean(np.abs(q_old-q))
        if dq<dq_min:
            break
        pi = epsilon_greedy(q)


def plot_ai_score(score_board, play_mode):
    player_0_wins = []
    for epoch in range(0, ep*epi, epi):
        score_stats_epoch = score_board[epoch:epoch+epi][:, -1]
        player_1_win_count = np.count_nonzero(score_stats_epoch)
        player_0_win_count = epi-player_1_win_count
        player_0_wins.append(player_0_win_count)
    plt.figure(figsize=(12, 4))
    plt.plot(range(1, ep+1), player_0_wins, 'b-')
    plt.title("[Q-Learning] AI Player(player-0)'s Score"+" / "+play_mode)
    plt.xlabel("number of epoch")
    plt.ylabel("number of wins")
    plt.tight_layout()
    plt.show()


# gameplay before training
for epoch in range(ep):
    print(f"++++++++++++++++")
    print(f"+ epoch : {epoch+1:4d} +")
    print(f"++++++++++++++++")
    for episode in range(epi):
        print(f"======== episode : {episode+1:3d} ========")
        score = gameplay_auto()
        score_board_before_train.append([epoch+1, episode, score])
        print()
    print()

# train
train()
pi = epsilon_greedy(q, 0.0)

# gameplay after training
for epoch in range(ep):
    print(f"++++++++++++++++")
    print(f"+ epoch : {epoch+1:4d} +")
    print(f"++++++++++++++++")
    for episode in range(epi):
        print(f"======== episode : {episode+1:3d} ========")
        score = gameplay_auto()
        score_board_after_train.append([epoch+1, episode, score])
        print()
    print()

# score visualization
plot_ai_score(np.array(score_board_before_train), "Before Training")
plot_ai_score(np.array(score_board_in_train), "In Training")
plot_ai_score(np.array(score_board_after_train), "After Training")

# enjoy game!
gameplay_human()
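
After training, a quick sanity check (my own addition, not part of the original listing) is to print the greedy action for each state and compare it against the multiple-of-4 strategy noted in section 1:

# inspect the learned greedy policy state by state
for i, s in enumerate(states):
    greedy_draw = np.argmax(q[i, :]) + 1  # draw 1, 2 or 3 cards
    print(f"state {s:3d} -> draw {greedy_draw} card(s)")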

4.2. using SARSA

import numpy as np
import matplotlib.pyplot as plt

np.random.seed(300)

ep = 100 # number of epochs
epi = 20 # number of episodes per epoch

score_board_before_train = []
score_board_in_train = []
score_board_after_train = []


class LastCard:
    def __init__(self, n=20): # n: number of cards; start with 10 or 20, final results should be for n=100
        self.n = n
        self.states = [s for s in range(1, self.n+3)]
        self.actions = [0, 1, 2] # 0: draw 1 card, 1: draw 2 cards, 2: draw 3 cards

    def reset(self):
        self.s = 1 # starting state index (fixed)
        self.state = self.states[self.s]
        return self.s

    def step(self, a):
        action = self.actions[a]
        state = self.state
        if state+action == self.n+1: # goal
            reward = 1
            done = True
            new_state = 0
            return reward, new_state, done
        elif state+action > self.n+1:
            reward = -1
            done = True
            new_state = None
            return reward, new_state, done

        last_card = state
        if action == 0:
            last_card += 1
        elif action == 1:
            last_card += 2
        elif action == 2:
            last_card += 3
        self.state = new_state = last_card
        self.s = s_new = self.states.index(self.state)
        reward = -1 # each step, get -1
        done = False
        return reward, self.s, done


""" Initialize the environment """
env = LastCard(100)
states = env.states
actions = env.actions

""" Initialize the value and policy """
q = np.zeros(shape=[len(states), len(actions)])
pi = np.ones(shape=[len(states), len(actions)])*(1/3.0)
pi_without_train = np.ones(shape=[len(states), len(actions)])*(1/3.0)


def run_episode(epoch, episode):
    s = env.reset()
    done = False
    S, A, R = [s], [], []
    r = 0
    while not done:
        # player-0 (AI)
        a_prob = pi[s, :]
        a = np.random.choice([0, 1, 2], p=a_prob) # sample the action
        r, s, done = env.step(a) # apply the action
        if done:
            break
        A.append(a) # add to the record
        R.append(r) # add to the record

        # player-1 (random select action, without train)
        a_prob_without_train = pi_without_train[s, :]
        a = np.random.choice([0, 1, 2], p=a_prob_without_train)
        _, s, done = env.step(a) # apply the action
        if done:
            S.append(None)
            break
        S.append(s) # add to the record
    score_board_in_train.append([epoch, episode, 0 if r==1 else 1]) # for visualization
    return S, A, R


# after completing the environment class, run gameplay_auto() to verify that it works properly
def gameplay_auto():
    s = env.reset()
    done = False
    while not done:
        player = 0
        a_prob = pi[s, :]
        a = np.random.choice([0, 1, 2], p=a_prob) # sample the action
        print(f"player-{player} > current_state: {s:2d}. draw cards (1/2/3): {a+1} {[s+i for i in range(a+1)]}")
        r, s, done = env.step(a)  # apply the action
        if done:
            break
        player = 1
        a_prob_without_train = pi_without_train[s, :]
        a = np.random.choice([0, 1, 2], p=a_prob_without_train)
        print(f"player-{player} > current_state: {s:2d}. draw cards (1/2/3): {a+1} {[s+i for i in range(a+1)]}")
        r, s, done = env.step(a) # apply the action
        if done:
            break
    if r == 1: # the player who just moved drew the last card
        print(f"====> player-{player} won!\n")
        return 0
    elif r == -1:
        print(f"====> player-{1-player} won!\n")
        return 1


def gameplay_human():
    s = env.reset()
    done = False
    while not done:
        player = 0
        a_prob = pi[s, :]
        a = np.random.choice([0, 1, 2], p=a_prob) # sample the action
        print(f"player-{player} > current_state: {s:2d}. draw cards (1/2/3): {a+1} {[s+i for i in range(a+1)]}")
        r, s, done = env.step(a)  # apply the action
        if done:
            break
        player = 1
        a = int(input("Input draw cards (1/2/3): "))-1
        print(f"player-{player} > current_state: {s:2d}. draw cards (1/2/3): {a+1} {[s+i for i in range(a+1)]}")
        r, s, done = env.step(a) # apply the action
        if done:
            break
    if r == 1: # the player who just moved drew the last card
        print(f"====> player-{player} won!")
    elif r == -1:
        print(f"====> player-{1-player} won!")

gamma = 0.99

"""
m = number of actions
set pi[s,a] = epsilon/m  for all a
set pi[s,a] = epsilon/m + (1-epsilon)  IF a = argmax_a q(s,a)
"""
def epsilon_greedy(q, epsilon=0.3):
    m = len(actions)
    pi[:, :] = epsilon/m
    for s in range(len(states)):
        a_max = np.argmax(q[s, :])
        pi[s, a_max] += (1-epsilon)
    return pi

"""
q_new = r + gamma*max_a'(q(s',a'))
q(s,a) = q(s,a) + alpha*(q_new - q(s,a))
"""
def sarsa(epoch, alpha=0.1, episodes=20):
    for episode in range(episodes):
        # gather/sample transitions
        S, A, R = run_episode(epoch, episode+1)
        for i in range(len(A)):
            s = S[i]
            a = A[i]
            r = R[i]
            s_next = S[i+1]
            if s_next is None:
                q[s, a] = -100000
            else:
                # the most probable (greedy) action under the epsilon-greedy policy
                a_next = np.argmax(epsilon_greedy(q)[s_next, :])
                q_target = q_new = r+gamma*q[s_next, a_next]
                q[s, a] = q[s, a]+alpha*(q_new-q[s, a])
            print(f"epoch: {epoch:4d} -- episode: {episode+1:4d} -- q[{s:2d}, {a}]: {q[s, a]:14.6f}")


dq_min = 0.01


def train():
    global pi
    for epoch in range(ep):
        q_old = q.copy()
        sarsa(epoch+1, episodes=epi)
        dq = np.mean(np.abs(q_old-q))
        if dq<dq_min:
            break
        pi = epsilon_greedy(q)


def plot_ai_score(score_board, play_mode):
    player_0_wins = []
    for epoch in range(0, ep*epi, epi):
        score_stats_epoch = score_board[epoch:epoch+epi][:, -1]
        player_1_win_count = np.count_nonzero(score_stats_epoch)
        player_0_win_count = epi-player_1_win_count
        player_0_wins.append(player_0_win_count)
    plt.figure(figsize=(12, 4))
    plt.plot(range(1, ep+1), player_0_wins, 'b-')
    plt.title("[SARSA] AI Player(player-0)'s Score"+" / "+play_mode)
    plt.xlabel("number of epoch")
    plt.ylabel("number of wins")
    plt.tight_layout()
    plt.show()


# gameplay before training
for epoch in range(ep):
    print(f"++++++++++++++++")
    print(f"+ epoch : {epoch+1:4d} +")
    print(f"++++++++++++++++")
    for episode in range(epi):
        print(f"======== episode : {episode+1:3d} ========")
        score = gameplay_auto()
        score_board_before_train.append([epoch+1, episode, score])
        print()
    print()

# train
train()
pi = epsilon_greedy(q, 0.0)

# gameplay after training
for epoch in range(ep):
    print(f"++++++++++++++++")
    print(f"+ epoch : {epoch+1:4d} +")
    print(f"++++++++++++++++")
    for episode in range(epi):
        print(f"======== episode : {episode+1:3d} ========")
        score = gameplay_auto()
        score_board_after_train.append([epoch+1, episode, score])
        print()
    print()

# score visualization
plot_ai_score(np.array(score_board_before_train), "Before Training")
plot_ai_score(np.array(score_board_in_train), "In Training")
plot_ai_score(np.array(score_board_after_train), "After Training")

# enjoy game!
gameplay_human()