# This is a slight revision of https://www.kaggle.com/code/auxeno/dqn-on-lunar-lander-rl
# One-time setup in the kernel's environment:
# pip install pygame
# pip install swig
# pip install box2d
# pip install gymnasium
# pip install requests

# Standard library
import base64, io
import glob
import os  # For creating save directory
import random
import time
import zipfile
from collections import deque, namedtuple

# Third-party
import gymnasium as gym
import imageio
import numpy as np
import requests
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# For visualization
from gymnasium.wrappers import RecordVideo
from IPython import display
from IPython.display import HTML

# Shows whether training will run on the GPU or fall back to the CPU
torch.cuda.is_available()

# Only for testing the environment: play one complete episode with random actions.
env = gym.make('LunarLander-v3', render_mode="human")
try:
    env.reset(seed=42)

    episode_over = False
    while not episode_over:
        action = env.action_space.sample()
        _, _, terminated, truncated, _ = env.step(action)
        # An episode ends either on a terminal state or on truncation
        episode_over = terminated or truncated
finally:
    # Always release the render window, even if a step raises or the cell is interrupted
    env.close()
Fuel is infinite, so an agent could learn to fly and then land on its first attempt.\n", + "\n", + "## Action Space\n", + "There are four discrete actions available: do nothing, fire left orientation engine, fire main engine, fire right orientation engine.\n", + "\n", + "| Action | Result |\n", + "|---------|---------------------------------|\n", + "| 0 | Do nothing |\n", + "| 1 | Fire left orientation engine |\n", + "| 2 | Fire main engine |\n", + "| 3 | Fire right orientation engine |\n", + "\n", + "## Observation Space\n", + "The state is an 8-dimensional vector: the coordinates of the lander in `x` & `y`, its linear velocities in `x` & `y`, its angle, its angular velocity, and two booleans that represent whether each leg is in contact with the ground or not.\n", + "\n", + "| Observation | Value |\n", + "|--------------|-----------------------------------------|\n", + "| 0 | `x` coordinate (float) |\n", + "| 1 | `y` coordinate (float) |\n", + "| 2 | `x` linear velocity (float) |\n", + "| 3 | `y` linear velocity (float) |\n", + "| 4 | Angle in radians from -π to +π (float) |\n", + "| 5 | Angular velocity (float) |\n", + "| 6 | Left leg contact (bool) |\n", + "| 7 | Right leg contact (bool) |\n", + "\n", + "## Rewards\n", + "Reward for moving from the top of the screen to the landing pad and coming to rest is about 100-140 points. If the lander moves away from the landing pad, it loses reward. If the lander crashes, it receives an additional -100 points. If it comes to rest, it receives an additional +100 points. Each leg with ground contact is +10 points. Firing the main engine is -0.3 points each frame. Firing the side engine is -0.03 points each frame. Solved is 200 points.\n", + "\n", + "## Starting State\n", + "The lander starts at the top center of the viewport with a random initial force applied to its center of mass.\n", + "\n", + "## Episode Termination\n", + "The episode finishes if:\n", + "\n", + "1. 
class SafeAgent:
    '''
    A rule-based agent that fires the main engine whenever the lander is too low.
    '''
    def act(self, state):
        '''
        Decision making method.
        Fly up if below the minimum height, otherwise do nothing.

        Parameters
        ----------
        state: array-like
            The current observation; index 1 is read as the `y` coordinate.

        Returns
        -------
        int
            2 (fire main engine) if `y` is below the minimum height, else 0 (do nothing).
        '''
        MIN_HEIGHT = 1
        MAIN_ENGINE = 2
        DO_NOTHING = 0

        return MAIN_ENGINE if state[1] < MIN_HEIGHT else DO_NOTHING


def play_episode(env, agent, seed=42):
    '''
    Plays a full episode for a given agent, environment and seed.

    Parameters
    ----------
    env: gym.Env
        The environment to play in. It is reset with `seed` but not closed here.
    agent: object
        Any object exposing `act(state)` that returns an action.
    seed: int, default=42
        Seed passed to `env.reset`.

    Returns
    -------
    float
        The total reward accumulated over the episode.
    '''
    score = 0
    state, _ = env.reset(seed=seed)

    done = False
    while not done:
        action = agent.act(state)
        state, reward, terminated, truncated, _ = env.step(action)
        score += reward

        # The episode is over on a terminal state or on truncation
        done = terminated or truncated

    # Return the accumulated score so callers can compare agents
    return score
safe agent may not have hit the ground, but it didn't take long to fly off screen, due to its inability to use the side engines.\n", + "\n", + "---\n", + "\n", + "## The Stable Agent (No AI, with a set of hardcoded rules)\n", + "Let's try to define an agent that can remain stable in the air.\n", + "\n", + "It will operate via the following rules, checked in order:\n", + "\n", + "1. If below height of 1: action = 2 (main engine)\n", + "2. If angle is above π/50: action = 3 (fire right engine)\n", + "3. If angle is below -π/50: action = 1 (fire left engine)\n", + "4. If x distance is above 0.4: action = 1 (fire left engine)\n", + "5. If x distance is below -0.4: action = 3 (fire right engine)\n", + "6. If below height of 1.5: action = 2 (main engine)\n", + "7. Else: action = 0 (do nothing)\n", + "\n", + "The idea is the lander will always use its main engine if it falls below a certain height, next it will prioritize stabilizing the angle of the lander, then the distance, then keeping it above another height. 
class StableAgent:
    '''
    A rule-based agent that tries to hover: it keeps the lander above a floor,
    roughly upright and roughly centred, using a fixed priority list of rules.
    '''
    def act(self, state):
        '''
        Decision making method.
        The rules are checked in priority order and the first match decides the action.

        Parameters
        ----------
        state: array-like
            The current observation; indices 0, 1 and 4 are read as `x`, `y` and angle.

        Returns
        -------
        int
            0 (do nothing), 1 (left engine), 2 (main engine) or 3 (right engine).
        '''
        # Action codes
        DO_NOTHING, LEFT_ENGINE, MAIN_ENGINE, RIGHT_ENGINE = 0, 1, 2, 3

        # Symmetric tolerances around upright / centre, and the two height floors
        ANGLE_LIMIT = 3.14/50
        X_LIMIT = 0.4
        HARD_FLOOR_Y = 1
        SOFT_FLOOR_Y = 1.5

        x, y, angle = state[0], state[1], state[4]

        # 1. Very low: gaining height beats everything else
        if y < HARD_FLOOR_Y:
            return MAIN_ENGINE

        # 2. Tilted too far either way: correct the angle
        if angle > ANGLE_LIMIT:
            return RIGHT_ENGINE
        if angle < -ANGLE_LIMIT:
            return LEFT_ENGINE

        # 3. Drifted too far sideways: steer back toward the centre
        if x > X_LIMIT:
            return LEFT_ENGINE
        if x < -X_LIMIT:
            return RIGHT_ENGINE

        # 4. Stable but sinking below the comfortable height: climb
        if y < SOFT_FLOOR_Y:
            return MAIN_ENGINE

        # 5. Nothing to correct
        return DO_NOTHING
than anticipated.\n", + "- Our initial efforts achieved some stability, but eventually, the lander lost control.\n", + "\n", + "---\n", + "\n", + "# The AI Agent (AI agent with Deep Reinforcement Learning)\n", + "To address this challenge, we'll use deep reinforcement learning techniques to train an agent to land the spacecraft.\n", + "\n", + "Simpler tabular methods are limited to discrete observation spaces, meaning there are a finite number of possible states. In `LunarLander-v3` however, we're dealing with a continuous range of states across 8 different parameters, meaning there are a near-infinite number of possible states. We could try to bin similar values into groups, but due to the sensitive controls of the game, even slight errors can lead to significant missteps.\n", + "\n", + "To get around this, we'll use a `neural network Q-function approximator`. This lets us predict the best actions to take for a given state, even when dealing with a vast number of potential states. It's a much better match for our complex landing challenge.\n", + "\n", + "## The DQN Algorithm:\n", + "\n", + "This breakthrough algorithm was used by Mnih et al. in 2015 to achieve human-level performance on several Atari 2600 games. \n", + "\n", + "The original paper published in Nature can be viewed here:\n", + "\n", + "https://www.deepmind.com/publications/human-level-control-through-deep-reinforcement-learning\n", + "\n", + "The algorithm:\n", + "\n", + "1. **Initialization**: Begin by initializing the parameters for two neural networks, $Q(s,a)$ (referred to as the online network) and $\\hat{Q}(s,a)$ (known as the target network), with random weights. Both networks serve the function of mapping a state-action pair to a Q-value, which is an estimate of the expected return from that pair. Also, set the exploration probability $\\epsilon$ to 1.0, and create an empty replay buffer to store past transition experiences.\n", + "2. 
**Action Selection**: Utilize an epsilon-greedy strategy for action selection. With a probability of $\\epsilon$, select a random action $a$, but in all other instances, choose the action $a$ that maximizes the Q-value, i.e., $a = argmax_aQ(s,a)$.\n", + "3. **Experience Collection**: Execute the chosen action $a$ within the environment emulator and observe the resulting immediate reward $r$ and the next state $s'$.\n", + "4. **Experience Storage**: Store the transition $(s,a,r,s')$ in the replay buffer for future reference.\n", + "5. **Sampling**: Randomly sample a mini-batch of transitions from the replay buffer for training the online network.\n", + "6. **Target Computation**: For every transition in the sampled mini-batch, compute the target value $y$. If the episode has ended at this step, $y$ is simply the reward $r$. Otherwise, $y$ is the sum of the reward and the discounted estimated optimal future Q-value, i.e., $y = r + \\gamma \\max_{a' \\in A} \\hat{Q}(s', a')$\n", + "7. **Loss Calculation**: Compute the loss, which is the squared difference between the Q-value predicted by the online network and the computed target, i.e., $\\mathcal{L} = (Q(s,a) - y)^2$\n", + "8. **Online Network Update**: Update the parameters of the online network $Q(s,a)$ using Stochastic Gradient Descent (SGD) to minimize the loss.\n", + "9. **Target Network Update**: Every $N$ steps, update the target network by copying the weights from the online network to the target network $\\hat{Q}(s,a)$.\n", + "10. **Iterate**: Repeat the process from step 2 until convergence.\n", + "\n", + "### Defining the Deep Q-Network\n", + "Our network will be a simple feedforward neural network that takes the state as input and produces Q-values for each action as output. 
class DQN(torch.nn.Module):
    '''
    A small fully connected network that maps a state to one Q-value per action.
    Each Q-value estimates the expected return of taking that action in the given state.

    Parameters
    ----------
    state_size: int, default=8
        Number of features in a state vector.
    action_size: int, default=4
        Number of discrete actions (one output per action).
    hidden_size: int, default=64
        Width of both hidden layers.
    '''
    def __init__(self, state_size=8, action_size=4, hidden_size=64):
        '''
        Build three linear layers:
            layer1: state_size  -> hidden_size
            layer2: hidden_size -> hidden_size
            layer3: hidden_size -> action_size
        '''
        super().__init__()
        Linear = torch.nn.Linear
        self.layer1 = Linear(state_size, hidden_size)
        self.layer2 = Linear(hidden_size, hidden_size)
        self.layer3 = Linear(hidden_size, action_size)

    def forward(self, state):
        '''
        Run the forward pass; invoked when the network is called on a state tensor.

        Parameters
        ----------
        state: torch.Tensor
            The state (or batch of states) to evaluate.

        Returns
        -------
        torch.Tensor
            The estimated Q-value of every action for the input state(s).
        '''
        hidden = state
        # Both hidden layers are followed by a ReLU; the output layer stays linear
        for layer in (self.layer1, self.layer2):
            hidden = torch.nn.functional.relu(layer(hidden))
        return self.layer3(hidden)
class ReplayBuffer:
    '''
    Fixed-capacity store of past transitions for experience replay.
    Once full, appending a new transition drops the oldest one. Sampling uniformly
    at random from the store breaks the correlation between consecutive steps.

    Parameters
    ----------
    buffer_size: int, default=10000
        The maximum number of transitions kept in the buffer.
    '''
    def __init__(self, buffer_size=10000):
        # A bounded deque evicts the oldest entry automatically when it is full
        self.buffer = deque(maxlen=buffer_size)

    def push(self, state, action, reward, next_state, done):
        '''
        Append one transition to the buffer.

        Parameters
        ----------
        state: array-like
            The state of the environment before taking the action.
        action: int
            The action taken by the agent.
        reward: float
            The reward received after taking the action.
        next_state: array-like
            The state of the environment after taking the action.
        done: bool
            Whether the episode ended after taking the action.
        '''
        transition = (state, action, reward, next_state, done)
        self.buffer.append(transition)

    def sample(self, batch_size):
        '''
        Draw `batch_size` distinct transitions uniformly at random.
        `batch_size` must not exceed the current number of stored transitions.

        Parameters
        ----------
        batch_size: int
            The number of transitions to draw.

        Returns
        -------
        tuple
            `(states, actions, rewards, next_states, dones)` where `states` and
            `next_states` are stacked into numpy arrays and the other three are
            plain tuples in the same order.
        '''
        batch = random.sample(self.buffer, batch_size)

        # Transpose the list of transitions into one column per field
        columns = tuple(zip(*batch))
        states, actions, rewards, next_states, dones = columns

        return np.stack(states), actions, rewards, np.stack(next_states), dones

    def __len__(self):
        '''
        Returns
        -------
        int
            The number of transitions currently stored.
        '''
        return len(self.buffer)
Let's walk through each part of this process:\n", + "\n", + "#### Initialisation\n", + "The `__init__` function sets up the agent:\n", + "\n", + "- `self.device`: We start by checking whether a GPU is available, and, if so, we use it, otherwise, we fall back to CPU. \n", + "- `self.gamma`: This is the discount factor for future rewards, used in the Q-value update equation.\n", + "- `self.batch_size`: This is the number of experiences we'll sample from the memory when updating the model.\n", + "- `self.q_network` and `self.target_network`: These are two instances of the Q-Network. The first is the network we're actively training, and the second is a copy that gets updated less frequently. This helps to stabilize learning.\n", + "- `self.optimizer`: This is the optimization algorithm used to update the Q-Network's parameters.\n", + "- `self.memory`: This is a replay buffer that stores experiences. It's an instance of the `ReplayBuffer` class.\n", + "\n", + "#### Step Function\n", + "The `step` function is called after each timestep in the environment:\n", + "\n", + "- The function starts by storing the new experience in the replay buffer.\n", + "- If enough experiences have been stored, it calls `self.update_model()`, which triggers a learning update.\n", + "\n", + "#### Action Selection\n", + "The act function is how the agent selects an action:\n", + "\n", + "- If a randomly drawn number is greater than $\\epsilon$, it selects the action with the highest predicted Q-value. This is known as exploitation: the agent uses what it has learned to select the best action.\n", + "- If the random number is less than $\\epsilon$, it selects an action randomly. 
This is known as exploration: the agent explores the environment to learn more about it.\n", + "\n", + "#### Model Update\n", + "The `update_model` function is where the learning happens:\n", + "\n", + "- It starts by sampling a batch of experiences from the replay buffer.\n", + "- It then calculates the current Q-values for the sampled states and actions, and the expected Q-values based on the rewards and next states.\n", + "- It calculates the loss, which is the mean squared difference between the current and expected Q-values.\n", + "- It then backpropagates this loss through the Q-Network and updates the weights using the optimizer.\n", + "\n", + "#### Target Network Update\n", + "Finally, the `update_target_network` function copies the weights from the Q-Network to the Target Network. This is done periodically (not every step), to stabilize the learning process. Without this, the Q-Network would be trying to follow a moving target, since it's learning from estimates produced by itself." 
class DQNAgent:
    '''
    This class represents a Deep Q-Learning agent that uses a Deep Q-Network (DQN) and a replay memory to interact
    with its environment.

    Parameters
    ----------
    state_size: int, default=8
        The size of the state space.
    action_size: int, default=4
        The size of the action space.
    hidden_size: int, default=64
        The size of the hidden layers in the network.
    learning_rate: float, default=1e-3
        The learning rate for the optimizer.
    gamma: float, default=0.99
        The discount factor for future rewards.
    buffer_size: int, default=10000
        The maximum size of the replay memory.
    batch_size: int, default=64
        The batch size for learning from the replay memory.
    '''
    def __init__(self, state_size=8, action_size=4, hidden_size=64,
                 learning_rate=1e-3, gamma=0.99, buffer_size=10000, batch_size=64):
        # Select device to train on (if CUDA available, use it, otherwise use CPU)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Discount factor for future rewards
        self.gamma = gamma

        # Batch size for sampling from the replay memory
        self.batch_size = batch_size

        # Number of possible actions
        self.action_size = action_size

        # Online network (trained every step) and target network (synchronised periodically)
        self.q_network = DQN(state_size, action_size, hidden_size).to(self.device)
        self.target_network = DQN(state_size, action_size, hidden_size).to(self.device)

        # Start the target network as an exact copy of the online network
        self.target_network.load_state_dict(self.q_network.state_dict())

        # The target network is only ever used for inference
        self.target_network.eval()

        # Only the online network's parameters are optimised
        self.optimizer = torch.optim.Adam(self.q_network.parameters(), lr=learning_rate)

        # Replay memory holding past transitions
        self.memory = ReplayBuffer(buffer_size)

    def step(self, state, action, reward, next_state, done):
        '''
        Record one transition in the replay memory and, once the memory holds more than
        `batch_size` transitions, run one learning update.

        Parameters
        ----------
        state: array-like
            The current state of the environment.
        action: int
            The action taken by the agent.
        reward: float
            The reward received after taking the action.
        next_state: array-like
            The state of the environment after taking the action.
        done: bool
            A flag indicating whether the episode has ended after taking the action.
        '''
        # Store the experience in memory
        self.memory.push(state, action, reward, next_state, done)

        # If there are enough experiences in memory, perform a learning step
        if len(self.memory) > self.batch_size:
            self.update_model()

    def act(self, state, eps=0.):
        '''
        Choose an action based on the current state and the epsilon-greedy policy.

        Parameters
        ----------
        state: array-like
            The current state of the environment (numpy array, list or tuple of numbers).
        eps: float, default=0.
            The epsilon for the epsilon-greedy policy. With probability eps, a random action is chosen.

        Returns
        -------
        int
            The chosen action.
        '''
        # Exploit: the random draw is above eps, so take the greedy action
        if random.random() > eps:
            # Accept any array-like state and add a batch dimension
            state_tensor = torch.as_tensor(
                np.asarray(state, dtype=np.float32), device=self.device
            ).unsqueeze(0)

            # Evaluate in eval mode, then restore whatever mode the network was in
            was_training = self.q_network.training
            self.q_network.eval()
            try:
                with torch.no_grad():
                    action_values = self.q_network(state_tensor)
            finally:
                self.q_network.train(was_training)

            # .item() yields a plain Python int, matching the documented return type
            return int(action_values.argmax(dim=1).item())

        # Explore: the random draw is at most eps, so pick a uniformly random action
        return random.randrange(self.action_size)

    def update_model(self):
        '''
        Update the Q-network based on a batch of experiences from the replay memory.
        '''
        # Sample a batch of experiences from memory
        states, actions, rewards, next_states, dones = self.memory.sample(self.batch_size)

        # Convert the sampled batch to tensors on the training device
        states = torch.as_tensor(np.asarray(states), dtype=torch.float32, device=self.device)
        actions = torch.as_tensor(np.asarray(actions), dtype=torch.int64, device=self.device)
        rewards = torch.as_tensor(np.asarray(rewards), dtype=torch.float32, device=self.device)
        next_states = torch.as_tensor(np.asarray(next_states), dtype=torch.float32, device=self.device)
        dones = torch.as_tensor(np.asarray(dones, dtype=np.float32), device=self.device)

        # Q-values of the actions that were actually taken
        q_values = self.q_network(states).gather(1, actions.unsqueeze(-1)).squeeze(-1)

        # Best next-state Q-value according to the target network; no gradient flows through it
        with torch.no_grad():
            next_q_values = self.target_network(next_states).max(dim=1).values

        # Bellman target: no bootstrapping from the next state when the episode is done
        expected_q_values = rewards + self.gamma * next_q_values * (1 - dones)

        # Mean squared error between the current and expected Q-values
        loss = torch.nn.functional.mse_loss(q_values, expected_q_values)

        # Standard optimisation step on the online network
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def update_target_network(self):
        '''
        Update the weights of the target network to match those of the Q-network.
        '''
        self.target_network.load_state_dict(self.q_network.state_dict())
In each episode:\n", + "\n", + "- The environment is reset to its initial state.\n", + "- The agent then interacts with the environment until the episode is done (it terminates or is truncated).\n", + "\n", + "#### Step Loop\n", + "In each step of an episode:\n", + "\n", + "- The agent selects an action using the current policy (the `act` method in `DQNAgent`).\n", + "- The selected action is applied to the environment using the environment's `step` method, which returns the next state, the reward, and the `terminated` and `truncated` flags that together indicate whether the episode is done.\n", + "- The agent's `step` method is called to update the agent's knowledge. This involves adding the experience to the replay buffer and, if enough experiences have been collected, triggering a learning update.\n", + "- The state is updated to the next state, and the reward is added to the score.\n", + "\n", + "After each episode:\n", + "\n", + "- The score for the episode is added to `scores` and `scores_window`.\n", + "- Epsilon is decayed according to `eps_decay`.\n", + "- If the episode is a multiple of `target_update`, the target network is updated with the latest weights from the Q-Network.\n", + "- Finally, every 100 episodes, the trained model is saved and the average score over the last 100 episodes is printed. If that average has reached 200, a final model is saved and training stops early.\n", + "\n", + "The function returns the list of scores for all episodes.\n", + "\n", + "This training process, which combines experiences from the replay buffer and separate target and Q networks, helps to stabilize the learning and leads to a more robust policy." 
# %%
# Train function with model saving every `save_every` episodes (default 100)
def train(agent, env, n_episodes=2000, eps_start=1.0, eps_end=0.01, eps_decay=0.995,
          target_update=10, save_dir="models", save_every=100, solved_score=200.0):
    '''
    Train a DQN agent with epsilon-greedy exploration and save periodic checkpoints.

    Parameters
    ----------
    agent: DQNAgent
        The agent to be trained. Must provide `act(state, eps)`,
        `step(state, action, reward, next_state, done)`, `update_target_network()`
        and a `q_network` attribute exposing `state_dict()`.
    env: gym.Env
        The environment in which the agent is trained.
    n_episodes: int, default=2000
        The maximum number of episodes for which to train the agent.
    eps_start: float, default=1.0
        The starting epsilon for epsilon-greedy action selection.
    eps_end: float, default=0.01
        The minimum value that epsilon can reach.
    eps_decay: float, default=0.995
        The multiplicative decay applied to epsilon after each episode.
    target_update: int, default=10
        The frequency (number of episodes) with which the target network is updated.
    save_dir: str, default="models"
        Directory where model checkpoints will be saved (created if missing).
    save_every: int, default=100
        Checkpoint / report interval in episodes. The "solved" check is only
        evaluated on these episodes.
    solved_score: float, default=200.0
        Training stops early once the mean score over the last (up to) 100
        episodes reaches this value on a checkpoint episode.

    Returns
    -------
    list of float
        The total reward obtained in each episode that was played.
    '''
    # Per-episode scores, plus a sliding window used for the running average
    scores = []
    scores_window = deque(maxlen=100)
    eps = eps_start

    # exist_ok avoids the check-then-create race of `if not exists: makedirs`
    os.makedirs(save_dir, exist_ok=True)

    # Loop over episodes
    for i_episode in range(1, n_episodes + 1):

        # Reset environment and score at the start of each episode
        state, _ = env.reset()
        score = 0

        # Loop over steps
        while True:
            # Select an action using current agent policy then apply in environment
            action = agent.act(state, eps)
            next_state, reward, terminated, truncated, _ = env.step(action)
            # Gymnasium reports two end conditions; either one ends the episode
            done = terminated or truncated

            # Update the agent, state and score
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward

            # End the episode if done
            if done:
                break

        # At the end of episode append and save scores
        scores_window.append(score)
        scores.append(score)

        # Decrease epsilon, clamped at eps_end
        eps = max(eps_end, eps_decay * eps)

        # Running average, computed once and reused below
        avg_score = np.mean(scores_window)

        # "\r" + end="" keeps the progress on a single, continuously rewritten line
        print(f"\rEpisode {i_episode}\tAverage Score: {avg_score:.2f}", end="")

        # Update target network every target_update episodes
        if i_episode % target_update == 0:
            agent.update_target_network()

        # Print average score and save model every save_every episodes
        if i_episode % save_every == 0:
            print(f'\rEpisode {i_episode}\tAverage Score: {avg_score:.2f}')
            # Save the q_network state_dict
            save_path = os.path.join(save_dir, f'dqn_model_episode_{i_episode}.pth')
            torch.save(agent.q_network.state_dict(), save_path)
            print(f"Saved model to {save_path}")

            # Stop training if solved (mean score >= solved_score)
            if avg_score >= solved_score:
                print("Environment solved!")
                # Save final model
                save_path = os.path.join(save_dir, 'dqn_model_final.pth')
                torch.save(agent.q_network.state_dict(), save_path)
                print(f"Saved final model to {save_path}")
                break

    return scores

# %%
# Make an environment
env = gym.make('LunarLander-v3')
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

# Initialize a DQN agent
agent = DQNAgent(state_size, action_size)  # Adjust parameters as needed

# Train it and save models every 100 episodes
try:
    scores = train(agent, env, n_episodes=2000, save_dir="lunar_lander_models")
finally:
    # Close environment even if training is interrupted or fails
    env.close()

# %% [markdown]
# #### Observations:
# - Our DQN agent is able to solve the game typically after playing around 1200 episodes.
# - Let's watch a video of this agent's performance:

# %% [markdown]
# ### Visualize the trained DQN agent
# We can load a trained model into an agent and play a game.
# #### Load a trained DQN agent:
# - Load a given trained DQN into an agent

# %%
# Function to load a model into an agent
def load_model(agent, model_path):
    """
    Load a saved model into the agent's q_network.

    The checkpoint is mapped onto the device the q_network currently lives on,
    so a checkpoint saved on a GPU machine can be loaded on a CPU-only machine.

    Parameters
    ----------
    agent: DQNAgent
        The agent to load the model into.
    model_path: str
        Path to the saved .pth file (e.g., "models/dqn_model_episode_200.pth").

    Raises
    ------
    FileNotFoundError
        If `model_path` does not exist (re-raised after printing a message).
    Exception
        Any other loading error (re-raised after printing a message).
    """
    try:
        # assumes agent.q_network is a torch.nn.Module (it is used via
        # load_state_dict/eval below); fall back to CPU if it has no parameters
        first_param = next(agent.q_network.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")
        # weights_only=True restricts unpickling to tensors/containers, which matters
        # because checkpoints may come from a downloaded archive
        state_dict = torch.load(model_path, map_location=device, weights_only=True)
        agent.q_network.load_state_dict(state_dict)
        agent.q_network.eval()  # Set to evaluation mode
        print(f"Loaded model from {model_path}")
    except FileNotFoundError:
        print(f"Error: Model file {model_path} not found.")
        raise
    except Exception as e:
        print(f"Error loading model: {e}")
        raise

# %% [markdown]
# #### A function that plays the episode:
# Each step of the environment is shown until an episode has ended.
# %%
def play_DQN_episode(env, agent, render=False, delay=0.05):
    """
    Play one episode greedily (epsilon = 0) and return its total reward.

    Parameters
    ----------
    env: gym.Env
        Environment to play in; it is reset at the start but not closed here.
    agent: DQNAgent
        Agent whose `act(state, eps)` method selects the actions.
    render: bool, default=False
        If True, `env.render()` is called after every step.
    delay: float, default=0.05
        Seconds to sleep after each rendered frame; ignored when `render` is
        False or when `delay` is not positive.

    Returns
    -------
    float
        Sum of the rewards collected during the episode.
    """
    score = 0
    # To make the episode reproducible use: state, _ = env.reset(seed=42)
    state, _ = env.reset()
    while True:
        # eps=0 disables exploration: always take the agent's preferred action
        action = agent.act(state, 0)
        state, reward, terminated, truncated, _ = env.step(action)
        score += reward
        if render:
            env.render()
            if delay > 0:
                time.sleep(delay)  # Slow down rendering
        if terminated or truncated:
            break
    return score

# %% [markdown]
# #### Load a trained agent and play the episode:

# %%
# Function to demo a DQN model
def demo_dqn_model(model_path, render=True, delay=0.05):
    """
    Demo a trained DQN model by playing an episode in LunarLander-v3.

    Parameters
    ----------
    model_path: str
        Path to the saved model file (e.g., "lunar_lander_models/dqn_model_episode_200.pth").
    render: bool, default=True
        Whether to render the episode. When False the environment is created
        without a render mode, so no window is opened.
    delay: float, default=0.05
        Delay in seconds between frames for rendering.

    Returns
    -------
    float
        The score obtained in the demo episode.
    """
    # Only open a window when rendering was actually requested
    env = gym.make("LunarLander-v3", render_mode="human" if render else None)
    try:
        state_size = env.observation_space.shape[0]
        action_size = env.action_space.n

        # Initialize a DQN agent
        agent = DQNAgent(state_size, action_size)  # Adjust parameters as needed

        # Load the saved model
        load_model(agent, model_path)

        # Play a demo episode
        print("\nStarting demo...")
        score = play_DQN_episode(env, agent, render=render, delay=delay)
        print("Score obtained:", score)
        return score
    finally:
        # Close the environment even if loading or playing failed
        env.close()

# %% [markdown]
# #### Test it yourself with a provided trained agent
# Replace XXXX in the model path with 100, 200, 300, ...
# ```
#     model_path = "models_xu/dqn_model_episode_XXXX.pth"
# ```

# %%
def download_and_unzip(url, output_dir=".", timeout=30):
    """
    Download a zip file from a URL and unzip it to the specified directory.

    The archive is stored temporarily as "models_xu.zip" inside `output_dir`
    and removed again after extraction (also when extraction fails).

    Parameters
    ----------
    url: str
        The raw URL of the zip file to download (e.g., GitHub raw link).
    output_dir: str, default="."
        The directory where the zip file will be saved and extracted
        (default is current directory). Created if it does not exist.
    timeout: float, default=30
        Timeout in seconds passed to `requests.get`, so a stalled connection
        cannot hang the notebook forever.

    Raises
    ------
    Exception
        If the server does not answer with HTTP status 200.
    """
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Define the local filename
    zip_filename = os.path.join(output_dir, "models_xu.zip")

    # Download the file; the context manager releases the streamed connection
    print(f"Downloading from {url}...")
    with requests.get(url, stream=True, timeout=timeout) as response:
        if response.status_code != 200:
            raise Exception(f"Failed to download file. Status code: {response.status_code}")
        with open(zip_filename, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                if chunk:  # Filter out keep-alive chunks
                    f.write(chunk)
    print(f"Downloaded to {zip_filename}")

    try:
        # Unzip the file
        print(f"Unzipping {zip_filename} to {output_dir}...")
        with zipfile.ZipFile(zip_filename, 'r') as zip_ref:
            zip_ref.extractall(output_dir)
        print("Unzipping complete.")
    finally:
        # Remove the zip file whether or not extraction succeeded
        os.remove(zip_filename)
        print(f"Removed {zip_filename}")

# %%
# Example usage
if __name__ == "__main__":
    # Correct raw GitHub URL for direct download
    url = "https://github.com/frankwxu/AI4DigitalForensics/raw/main/lab10_Reinforcement_Learning/models_xu.zip"

    # Run the download and unzip
    download_and_unzip(url)
    print("Files extracted to current directory:")
    print(os.listdir("."))  # List files to verify

    # This is the instructor's trained model downloaded from GitHub.
    # Models trained by the training cell of this notebook are saved under
    # "lunar_lander_models" instead of "models_xu"; to test your own model use e.g.
    # model_path = "lunar_lander_models/dqn_model_episode_1600.pth"
    model_path = "models_xu/dqn_model_episode_1600.pth"  # Change to your desired model

    score = demo_dqn_model(model_path, render=True, delay=0.000)