Mine Sweeper Game
Minesweeper using Reinforcement Learning¶
Import Libraries¶
In [1]:
import pandas as pd
import numpy as np
from itertools import product
import random
from random import choice
from collections import namedtuple
from scipy.signal import convolve2d
from tqdm import trange
from time import sleep
import matplotlib.pyplot as plt
from IPython.display import clear_output
import ipywidgets as widgets
%matplotlib qt
%matplotlib inline
Minesweeper Class¶
In [2]:
class Minesweeper(object):
    def __init__(self, difficulty):  # difficulty consists of [[width, height], number of bombs]
        self.size = (difficulty[0][1], difficulty[0][0])
        self.discovered = np.zeros(self.size, dtype=bool)
        is_bomb = np.zeros(difficulty[0][1]*difficulty[0][0], dtype=bool)
        is_bomb[:difficulty[1]] = True
        np.random.shuffle(is_bomb)
        self.is_bomb = is_bomb.reshape(self.size)
        # Count neighbouring bombs for every cell with a 3x3 convolution (centre excluded).
        neighbour_kernel = np.ones((3, 3))
        neighbour_kernel[1, 1] = 0
        self.n_neighbours = convolve2d(self.is_bomb, neighbour_kernel, mode="same").astype(int)
        self.dead = False

    def reset(self):
        self.dead = False
        self.discovered = np.zeros_like(self.discovered)

    @property
    def board_numpy(self):
        array = np.zeros_like(self.is_bomb, dtype='U1')  # 'U1' keeps the one-character 'B' marker
        array[self.is_bomb] = 'B'
        array = np.where(~self.is_bomb, self.n_neighbours, array)
        array = np.where(self.discovered, array, '?')
        return array

    @property
    def board_show(self):  # Returns a grid view of the game as a styled DataFrame
        array = self.board_numpy
        df = pd.DataFrame(array)

        def style(cell):
            if cell == "B":
                return "background-color : indianred"
            elif cell == "?":
                return ""
            else:
                return "background-color : lightgreen"

        df = df.style.applymap(style)
        return df

    @property
    def map_str(self):
        array = self.board_numpy
        return "\n".join(map(lambda row: "".join(map(str, row)), array))

    @property
    def is_terminal(self):
        n_undiscovered_non_bombs = (~self.is_bomb & ~self.discovered).sum()
        return self.dead or n_undiscovered_non_bombs == 0

    def click(self, row, column):
        self.discovered[row, column] = True
        if self.is_bomb[row, column]:
            self.dead = True
            reward = -100000
        elif self.is_terminal:
            if self.dead:
                reward = -100000
            else:
                reward = 100000
        else:
            reward = -1
        return reward

    def gameplay_human(self):
        self.reset()
        display(self.board_show)
        print("Enter q to exit.")
        while not self.is_terminal:
            try:
                row, column = map(int, input("Enter row and column numbers:").split(" "))
            except Exception:  # any malformed input (e.g. "q") aborts the game
                print("aborted..")
                break
            self.click(row, column)
            clear_output(wait=True)
            display(self.board_show)
        if self.dead:
            print("you lose!")
        elif self.is_terminal:
            print("you won!")
        print("re-run me for a new game!")
Game Difficulty¶
In [3]:
difficulty = {"beginning" : [[9, 9], 10], # difficulty is consist of [[width, height], number of bomb]
"intermediate" : [[16, 16], 40],
"advanced" : [[30, 16], 99]}
Play Game¶
In [4]:
game = Minesweeper(difficulty["advanced"])
game.gameplay_human()
[Board display: a 16x30 styled-DataFrame grid. The handful of clicked cells near the upper-left diagonal are revealed (values 0 to 2); every other cell still shows "?".]
Enter row and column numbers:3
aborted..
re-run me for a new game!
Reinforcement Learning (Q-Learning)¶
In [5]:
def q_learning(game, episodes, alpha, gamma, epsilon):
    click_count_list = []
    is_win_list = []
    size = game.size
    actions = list(product(range(size[0]), range(size[1])))
    # Q-table keyed by the last-clicked cell (the "state"); one Q-value per clickable cell.
    q_table = {(i, j): np.zeros(len(actions)) for i in range(size[0]) for j in range(size[1])}
    for episode in trange(episodes, desc="Training", unit="episodes"):
        game.reset()
        state = (0, 0)  # (random.choice(range(size[0])), random.choice(range(size[1])))
        click_count = 0
        while not game.is_terminal:
            click_count += 1
            # Only cells that have not been revealed yet are valid actions.
            available_actions = [i for i, action in enumerate(actions) if not game.discovered[action[0]][action[1]]]
            if np.random.rand() < epsilon:
                action = random.choice(available_actions)  # explore
            else:
                action = np.argmax(q_table[state][available_actions])  # exploit
                action = available_actions[action]
            reward = game.click(*actions[action])
            next_state = actions[action]
            q_table[state][action] = q_table[state][action] + alpha*(reward + gamma*np.max(q_table[next_state]) - q_table[state][action])
            state = next_state
        if game.dead:
            is_win_list.append(False)
        else:
            is_win_list.append(True)
        click_count_list.append(click_count)
    return q_table, click_count_list, is_win_list
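The assignment inside the while loop is the standard tabular Q-learning update. Here the "state" is simply the coordinates of the most recently clicked cell, and an action is the next cell to click:

$$Q(s, a) \leftarrow Q(s, a) + \alpha\left[r + \gamma \max_{a'} Q(s', a') - Q(s, a)\right]$$

where $\alpha$ is the learning rate, $\gamma$ the discount factor, $r$ the reward returned by `click`, and the $\epsilon$-greedy rule picks a random unrevealed cell with probability $\epsilon$ and the highest-valued one otherwise.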
Learning¶
In [15]:
def train(game_difficulty, episodes, alpha=0.01, gamma=0.99, epsilon=0.1):
    game = Minesweeper(game_difficulty)
    q_table, click_count_list, is_win_list = q_learning(game, episodes, alpha, gamma, epsilon)
    # A perfect game reveals every non-bomb cell exactly once.
    goal_click_count = game_difficulty[0][1]*game_difficulty[0][0] - game_difficulty[1]
    max_click_count = max(click_count_list)
    win_count = sum(is_win_list)
    max_click_loss_count = click_count_list.count(max_click_count) - win_count
    print(f"goal click = {goal_click_count}")
    print(f"max click = {max_click_count}")
    print(f"max click(win) = {win_count}")
    print(f"max click(loss) = {max_click_loss_count}")
    print(f"first max click episode = {click_count_list.index(max_click_count)}")
    if win_count > 0:
        print(f"first win episode = {is_win_list.index(True)}")
    else:
        print(f"first win episode = None")
    plt.figure(figsize=(12, 4))
    plt.plot(click_count_list)
    plt.ylim(0, goal_click_count)
    plt.show()
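`train` only reports statistics gathered while learning; it builds its own game and does not return the Q-table. If you want to watch what the learned table actually does, a rough sketch of a greedy (epsilon = 0) rollout could look like the following; `evaluate_greedy` is a hypothetical helper, not part of the original notebook.

def evaluate_greedy(game, q_table, max_clicks=1000):
    """Hypothetical helper: play one game greedily from a learned q_table."""
    game.reset()
    size = game.size
    actions = list(product(range(size[0]), range(size[1])))
    state = (0, 0)
    clicks = 0
    while not game.is_terminal and clicks < max_clicks:
        clicks += 1
        available = [i for i, a in enumerate(actions) if not game.discovered[a[0]][a[1]]]
        best = available[np.argmax(q_table[state][available])]  # purely greedy choice
        game.click(*actions[best])
        state = actions[best]
    return (not game.dead) and game.is_terminal, clicks

# Usage sketch: learn a table with q_learning directly, then roll out once.
# game = Minesweeper(difficulty["beginning"])
# q_table, _, _ = q_learning(game, episodes=5000, alpha=0.01, gamma=0.99, epsilon=0.1)
# print(evaluate_greedy(game, q_table))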
Difficulty : Beginning¶
In [12]:
train(difficulty["beginning"], episodes=5000)
Training: 100%|██████████████████████| 5000/5000 [00:09<00:00, 553.53episodes/s]
goal click = 71
max click = 71
max click(win) = 525
max click(loss) = 51
first max click episode = 790
first win episode = 790
Difficulty : Intermediate¶
In [11]:
train(difficulty["intermediate"], episodes=100000)
Training: 100%|██████████████████| 100000/100000 [08:26<00:00, 197.53episodes/s]
goal click = 216
max click = 216
max click(win) = 39
max click(loss) = 5
first max click episode = 10378
first win episode = 11382
Difficulty : Advanced¶
In [9]:
train(difficulty["advanced"], episodes=10000000)
Training: 100%|███████████| 10000000/10000000 [22:23:51<00:00, 124.02episodes/s]
goal click = 381
max click = 381
max click(win) = 2
max click(loss) = 0
first max click episode = 5353892
first win episode = 5353892
Ablation Study (Intermediate Difficulty)¶
$\epsilon$ value variation¶
In [13]:
train(difficulty["intermediate"], episodes=100000, epsilon=0.1)
Training: 100%|██████████████████| 100000/100000 [08:26<00:00, 197.26episodes/s]
goal click = 216
max click = 216
max click(win) = 40
max click(loss) = 2
first max click episode = 10533
first win episode = 10533
In [21]:
train(difficulty["intermediate"], episodes=100000, epsilon=0.25)
Training: 100%|██████████████████| 100000/100000 [03:50<00:00, 433.65episodes/s]
goal click = 216
max click = 178
max click(win) = 0
max click(loss) = 1
first max click episode = 12333
first win episode = None
In [16]:
train(difficulty["intermediate"], episodes=100000, epsilon=0.5)
Training: 100%|██████████████████| 100000/100000 [01:56<00:00, 857.65episodes/s]
goal click = 216
max click = 112
max click(win) = 0
max click(loss) = 1
first max click episode = 80817
first win episode = None
In [22]:
train(difficulty["intermediate"], episodes=100000, epsilon=0.75)
Training: 100%|█████████████████| 100000/100000 [01:17<00:00, 1290.60episodes/s]
goal click = 216
max click = 85
max click(win) = 0
max click(loss) = 1
first max click episode = 62118
first win episode = None
In [17]:
train(difficulty["intermediate"], episodes=100000, epsilon=0.99)
Training: 100%|█████████████████| 100000/100000 [00:57<00:00, 1751.72episodes/s]
goal click = 216
max click = 63
max click(win) = 0
max click(loss) = 1
first max click episode = 18102
first win episode = None
$\gamma$ value variation¶
In [18]:
train(difficulty["intermediate"], episodes=100000, gamma=0.99)
Training: 100%|██████████████████| 100000/100000 [08:34<00:00, 194.55episodes/s]
goal click = 216
max click = 216
max click(win) = 42
max click(loss) = 10
first max click episode = 13244
first win episode = 13244
In [23]:
train(difficulty["intermediate"], episodes=100000, gamma=0.75)
Training: 100%|██████████████████| 100000/100000 [08:29<00:00, 196.25episodes/s]
goal click = 216
max click = 216
max click(win) = 40
max click(loss) = 6
first max click episode = 17118
first win episode = 17118
In [19]:
train(difficulty["intermediate"], episodes=100000, gamma=0.5)
Training: 100%|██████████████████| 100000/100000 [08:31<00:00, 195.69episodes/s]
goal click = 216
max click = 216
max click(win) = 49
max click(loss) = 8
first max click episode = 10149
first win episode = 10149
In [24]:
train(difficulty["intermediate"], episodes=100000, gamma=0.25)
Training: 100%|██████████████████| 100000/100000 [08:31<00:00, 195.49episodes/s]
goal click = 216
max click = 216
max click(win) = 45
max click(loss) = 5
first max click episode = 11296
first win episode = 11296
In [20]:
train(difficulty["intermediate"], episodes=100000, gamma=0.1)
Training: 100%|██████████████████| 100000/100000 [08:26<00:00, 197.38episodes/s]
goal click = 216
max click = 216
max click(win) = 57
max click(loss) = 7
first max click episode = 12414
first win episode = 12414