will be introduced later. Written as code, this MDP looks as shown below.
class Mdp:
    def __init__(self):
        # States 1-5 are ordinary cells; states 6, 7 and 8 are terminal.
        self.states = [1, 2, 3, 4, 5, 6, 7, 8]
        self.terminal_states = {6: 1, 7: 1, 8: 1}
        # Actions: north, east, south, west.
        self.actions = ['n', 'e', 's', 'w']
        # Rewards for specific (state, action) pairs; all other pairs yield 0.
        self.rewards = {'1_s': -1.0, '3_s': 1.0, '5_s': -1.0}
        # Deterministic transition table, keyed by '(state)_(action)'.
        # A missing key means the action leaves the state unchanged.
        self.t = {
            '1_s': 6, '1_e': 2,
            '2_w': 1, '2_e': 3,
            '3_s': 7, '3_w': 2, '3_e': 4,
            '4_w': 3, '4_e': 5,
            '5_s': 8, '5_w': 4,
        }
        self.gamma = 0.8  # discount factor

    def transform(self, state, action):
        """Apply `action` in `state`; return (is_terminal, next_state, reward)."""
        if state in self.terminal_states:
            return True, state, 0.0
        key = '%d_%s' % (state, action)
        next_state = self.t.get(key, state)
        is_terminal = next_state in self.terminal_states
        reward = self.rewards.get(key, 0.0)
        return is_terminal, next_state, reward
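A quick sanity check of transform; the expected tuples follow directly from the transition and reward tables above, and the __main__ guard is only for illustration.

if __name__ == '__main__':
    mdp = Mdp()
    # Moving south from state 1 enters terminal state 6 with reward -1.
    print(mdp.transform(1, 's'))  # (True, 6, -1.0)
    # '2_n' is not in the transition table, so the agent stays in state 2.
    print(mdp.transform(2, 'n'))  # (False, 2, 0.0)

Because the class exposes the complete model (states, actions, deterministic transitions, and rewards), we can also plan in it directly with dynamic programming. Below is a minimal value-iteration sketch over this Mdp; the function name value_iteration and the convergence threshold theta are illustrative choices, not part of the original code.

def value_iteration(mdp, theta=1e-6):
    # Initialize V(s) = 0 for every state; terminal states keep value 0.
    v = {s: 0.0 for s in mdp.states}
    while True:
        delta = 0.0
        for s in mdp.states:
            if s in mdp.terminal_states:
                continue
            # Bellman optimality backup: best one-step return over all actions.
            returns = []
            for a in mdp.actions:
                _, next_s, r = mdp.transform(s, a)
                returns.append(r + mdp.gamma * v[next_s])
            best = max(returns)
            delta = max(delta, abs(best - v[s]))
            v[s] = best
        if delta < theta:  # values have converged
            return v

Running value_iteration(Mdp()) converges quickly on this small model: state 3 reaches value 1.0 (its south move pays +1), and its neighbors inherit discounted fractions of that value.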
The Markov decision process is the theoretical foundation of reinforcement learning. Whether we apply reinforcement learning to Gomoku, to StarCraft, or to robot locomotion, we assume that a Markov decision process underlies the task. The difference lies in how much of that process we have access to: sometimes we know all of its information (the state set, the action set, the transition probabilities, and the rewards); sometimes we know only part of it (the state set and the action set); and sometimes the process is too large to store in full (for example, the total number of board states in Go is