Python Overflow 实现 TD 学习

Python Overflow Implementing TD Learning

提问人:jroc 提问时间:9/29/2023 最后编辑:jroc 更新时间:9/29/2023 访问量:55

问:

尝试在悬崖行走的健身房环境中实现 td lambda,但我得到的 V 函数值爆炸,可能是因为溢出。有趣的是,溢出仅在 lambda=1 时发生。蟒蛇:3.11.4。使用此环境 https://gymnasium.farama.org/environments/toy_text/cliff_walking/

env = gym.make('CliffWalking-v0')

def td_lambda(lmbda):

    n_states = 48
    Vf = np.zeros(n_states) # init value function
    e_trace = np.zeros(n_states)
    all_neu = np.zeros(1000)
    avg_neu = []
    alpha = 0.5
    gamma = 1.0

    for episode in range(1000):
        current_state, _ = env.reset() # reset environment on each episode start
        neu = np.zeros(20)
        for k in range(20):
            action = np.random.choice(3) # select action uniformly
            if k == 0: # go up on first action
                action = 0
            next_state, reward, done, _, _ = env.step(action)
            td = reward + gamma * Vf[next_state] - Vf[current_state]
            e_trace *= lmbda * gamma
            e_trace[current_state] += 1.0
            # if lmbda != 0:
            #     e_trace *= lmbda * gamma
            #     e_trace[current_state] += 1
            # else:
            #     e_trace = np.ones(n_states)
            Vf[current_state] += alpha * e_trace[current_state] * td
            current_state = next_state
            neu[k] = np.square(td * e_trace[current_state])

            if done:
                break

        all_neu[episode] = np.sum(neu) / 20
        if (episode+1) % 10 == 0:
            prev_10 = all_neu[episode-9:episode+1]
            assert len(prev_10) == 10
            avg_neu.append(np.mean(prev_10))

    avg_neu.insert(0, 0)

    fig, ax = plt.subplots()
    ax.plot(range(0, 1001, 10), avg_neu)
    ax.set_title(f'Average NEU where lambda = {lmbda}')
    return Vf

td_lambda(1.0)

生成以下警告:

/var/folders/x4/19d18bzn61lc0qzzl1j__tpw0000gn/T/ipykernel_36323/1625313874.py:29: RuntimeWarning: overflow encountered in square
  neu[k] = np.square(td * e_trace[current_state])
/var/folders/x4/19d18bzn61lc0qzzl1j__tpw0000gn/T/ipykernel_36323/1625313874.py:27: RuntimeWarning: overflow encountered in scalar multiply
  Vf[current_state] += alpha * e_trace[current_state] * td
/var/folders/x4/19d18bzn61lc0qzzl1j__tpw0000gn/T/ipykernel_36323/1625313874.py:29: RuntimeWarning: overflow encountered in scalar multiply
  neu[k] = np.square(td * e_trace[current_state])
/var/folders/x4/19d18bzn61lc0qzzl1j__tpw0000gn/T/ipykernel_36323/1625313874.py:27: RuntimeWarning: invalid value encountered in scalar add
  Vf[current_state] += alpha * e_trace[current_state] * td

前 100 个 Vf[current_state] 值的示例:

-0.5
-0.75
-1.75
-2.75
-151.75
144.25
-0.125
-0.5
-0.75
-1.0
-0.5
-0.5
-0.5
-0.5
-0.5
-1.0
-0.5
-0.5
-0.5
22.125
-219.1875
-3.875
-1.75
-1.5
-1.0
-1.0
-1.75
-1.0
-0.5
-0.5
-0.75
-1.5
-2.5
-0.75
-159.59375
-519.1875
1280.90625
3.3125
-0.5
-0.5
-3833.46875
-14.875
-1.75
-1.0
-1.0
-2.75
-5.75
-8.75
3.625
-1.75
-3933.46875
-3837.96875
-4337.96875
19433.546875
31.9375
-1.5
-2.5
-2.25
4.4375
-15.5
-96982.109375
-106.8125
-242702.6484375
532700.8203125
2397523.03515625
532693.8203125
532686.3203125
15451372.0390625
-10803610.3828125
485397.796875
-5900.953125
23178874.79296875
15450522.0390625
15449622.0390625
15448672.0390625
-247074162.1796875
-1310291650.765625
-11410857802.832031
-61913688569.16406
-11410857813.832031
-592193411511.6504
-3508731887701.825
-35590655125805.75
-228082194554436.28
-35590655127055.75
-35590655128355.75
-35590655129705.75
-2730472207075947.0
-1.8996007288465772e+16
-2730472207077397.0
-2730472207077412.0
-2730472207078962.0
-2730472207078978.0
-2730472207078994.5
-2730472207079011.5
-2.873773361313473e+17
-2.166046638031518e+18
-2.8737733613134902e+17
-2.873773361313508e+17
-2.873773361313526e+17

我尝试将所有变量转换为浮点数。

Python 浮点 强化学习 时差

评论


答: 暂无答案