220\n o[\"collided\"] = o[\"in_hazard\"]\n return o\n\ndef wrap_(a): return (a + np.pi) % (2*np.pi) - np.pi\n\ndef reward(obs, action, next_obs):\n if next_obs[\"reached\"]: return 1.0\n return 0.0 # SPARSE — curve will stay flat near zero\nprint(\"harness ready; sparse reward defined (it will fail by design).\")\n","label":"1 — Tiny RL harness (provided) + your sparse reward (flat curve)"},{"code":"# Dense progress signal + terminal bonus; YOU add hazard + time penalties.\nHAZARD_PENALTY = 0.0 # <-- TUNE: must exceed the progress gained by clipping the corner\nTIME_PENALTY = 0.0 # <-- TUNE: small constant, e.g. 0.01, so it doesn't loiter\n\ndef reward(obs, action, next_obs):\n progress = obs[\"goal_dist\"] - next_obs[\"goal_dist\"] # >0 when getting closer\n r = 2.0 * progress\n if next_obs[\"reached\"]:\n r += 10.0\n if next_obs[\"in_hazard\"]:\n r -= HAZARD_PENALTY\n r -= TIME_PENALTY\n return r\nprint(\"shaped reward set. hazard_penalty =\", HAZARD_PENALTY, \" time_penalty =\", TIME_PENALTY)\n","label":"2 — Shaped reward (progress + your safety/time terms)"},{"code":"# A compact, seedable trainer: a softmax policy over 4 actions conditioned on a\n# coarse state bin; climbs the reward you designed. 
Deterministic for grading.\nimport numpy as np\n\ndef train(reward_fn, episodes=600, seed=440):\n r = np.random.default_rng(seed)\n theta = np.zeros((6, 4)) # 6 state bins x 4 actions\n def feat(o):\n b = min(5, int(o[\"goal_dist\"]))\n return b\n curve = []\n for ep in range(episodes):\n env = Arena(seed + (ep % 3)); o = env.reset()\n total = 0.0; grads = []\n for t in range(221):\n b = feat(o); z = theta[b] - theta[b].max(); p = np.exp(z); p /= p.sum()\n a = int(r.choice(4, p=p))\n no = env.step(a); rw = reward_fn(o, a, no); total += rw\n g = -p; g[a] += 1.0; grads.append((b, g, rw))\n o = no\n if o[\"done\"]: break\n for (b, g, rw) in grads:\n theta[b] += 0.02 * g * (total / 50.0) # crude REINFORCE update\n curve.append(total)\n train.policy = theta\n return curve\n\ndef smoothed(c, w=20):\n c = np.array(c); k = np.ones(w)/w\n return np.convolve(c, k, mode='valid')\n\ncurve = train(reward, 600, 440)\nfinal = float(smoothed(curve)[-1])\nprint(\"final smoothed reward:\", round(final, 2))\n","label":"3 — Train to convergence (deterministic policy-gradient-lite)"},{"code":"import numpy as np\ndef rollout(theta, seed):\n env = Arena(seed); o = env.reset()\n def feat(o): return min(5, int(o[\"goal_dist\"]))\n steps = 0\n for t in range(221):\n a = int(np.argmax(theta[feat(o)])); o = env.step(a); steps += 1\n if o[\"done\"]: break\n return o.get(\"reached\", False), o.get(\"collided\", False), steps\n\nsm = smoothed(curve); slope = sm[-1] - sm[-min(100, len(sm))]\nreached, collided, steps = rollout(train.policy, 440)\ncurve2 = train(reward, 600, 441); final2 = float(smoothed(curve2)[-1])\nif final < 6.0 or slope < 0:\n print(f\"FAIL: final smoothed reward {final:.1f} (need >=6.0, non-decreasing). 
Reward too \"\n f\"sparse -> add a dense per-step term for closing goal_dist (potential shaping).\")\nelif collided:\n print(\"FAIL: converged but collided in hazard — HAZARD_PENALTY is smaller than the \"\n \"progress gained by clipping the zone; raise it above that progress.\")\nelif not reached:\n print(\"FAIL: reward high but reached=False — shaping rewards moving, not arriving; \"\n \"keep the terminal bonus and confirm progress is obs goal_dist minus next_obs goal_dist.\")\nelif steps > 220:\n print(f\"FAIL: reached but steps {steps} (>220) — no time penalty; subtract a small \"\n f\"constant each step (~0.01-0.05).\")\nelif final2 < 6.0:\n print(f\"FAIL: seed 440 passed but seed 441 reward {final2:.1f} — magnitudes tuned to one \"\n f\"init; prefer potential-based shaping invariant to seed.\")\nelse:\n print(f\"PASS: converged to {final:.1f}, reached goal safely in {steps} steps, \"\n f\"re-converges at seed 441 ({final2:.1f}).\")\n","label":"4 — Autograder (PASS = reward>=6, reach, safe, efficient, seed 441)"}],"intro":"Write the reward function (progress + safety + time) so the provided RL loop converges.","key":"programming/reinforcement-learning","kind":"python","title":"Reinforcement Learning"}">
PYTHON · NUMPY · IN-BROWSER
Reinforcement Learning
Write the reward function (progress + safety + time) so the provided RL loop converges.