Basics of the Actor-Critic Algorithm

Let's start with a simple Actor that uses a neural network to generate actions; we'll implement it with TensorFlow:

import tensorflow as tf
from tensorflow.keras import layers

class SimpleActorModel(tf.keras.Model):
    def __init__(self, action_size):
        super(SimpleActorModel, self).__init__()
        self.dense1 = layers.Dense(128, activation="relu")
        self.dense2 = layers.Dense(128, activation="relu")
        self.output_action = layers.Dense(action_size, activation="softmax")

    def call(self, state):
        x = self.dense1(state)
        x = self.dense2(x)
        return self.output_action(x)

# usage example
action_size = 2  # assume there are 2 possible actions
model = SimpleActorModel(action_size)

state = tf.constant([[0.1, 0.2, 0.3]])  # example environment state
action_probabilities = model(state)
print("Action probability distribution:", action_probabilities.numpy())

SimpleActorModel takes the state of the environment as input (in our case a vector) and outputs a probability distribution over the possible actions. It processes the state with two hidden ReLU layers and applies softmax at the output to obtain a probability for each action.
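
To turn these probabilities into a concrete action, you can sample from the distribution. A minimal sketch, reusing the model and state from the snippet above:

# sample one action index from the actor's probability distribution
action = tf.random.categorical(tf.math.log(action_probabilities), num_samples=1)
print("Sampled action:", int(action[0, 0]))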

Critic

The Critic evaluates how good or bad the Actor's chosen action is in terms of achieving the final goal. It does this by computing a value function that estimates the future reward for taking a certain action or for being in a certain state. In other words, the Critic tries to predict what reward the agent can expect in the long run, given the current state and the action taken.

The Critic can also be implemented with TensorFlow:

import tensorflow as tf
from tensorflow.keras import layers

class SimpleCriticModel(tf.keras.Model):
    def __init__(self):
        super(SimpleCriticModel, self).__init__()
        self.dense1 = layers.Dense(128, activation="relu")
        self.dense2 = layers.Dense(128, activation="relu")
        self.value = layers.Dense(1)

    def call(self, state):
        x = self.dense1(state)
        x = self.dense2(x)
        return self.value(x)

# usage example
model = SimpleCriticModel()

state = tf.constant([[0.1, 0.2, 0.3]])  # example environment state
value = model(state)
print("Predicted state value:", value.numpy())

SimpleCriticModel analyzes the current state of the environment and returns an estimate of its value: the Critic produces a single number representing the expected reward.

Actor-Critic algorithm

The Actor-Critic algorithm combines these two components as follows.

The Actor selects an action based on the current state of the environment.

After the action is performed, the Critic evaluates it with its value function. This estimate shows how good the chosen action was in terms of the expected long-term reward.

Based on the Critic's assessment, the Actor adjusts its strategy. This can be implemented, for example, through policy gradient ascent, where the Actor updates its parameters in the direction that increases the expected reward.

The Critic also updates its parameters, using the reward received and the difference between the expected and actual rewards, so that its predictions become more accurate in the future.
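
As a quick numeric illustration of that "difference between the expected and actual rewards" (the TD error), here is a minimal sketch with made-up values:

# hypothetical numbers, purely for illustration
reward = 1.0        # reward actually received after taking the action
value_s = 0.8       # Critic's estimate V(s) for the current state
value_s_next = 0.5  # Critic's estimate V(s') for the next state
gamma = 0.99        # discount factor

td_error = reward + gamma * value_s_next - value_s
print(td_error)  # 0.695 -> positive, so the outcome was better than the Critic expected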

Implementation examples

Basic example with PyTorch:

import torch
import torch.nn as nn
import torch.optim as optim

class Actor(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(Actor, self).__init__()
        self.linear = nn.Linear(input_dim, output_dim)
    
    def forward(self, state):
        return torch.softmax(self.linear(state), dim=-1)

class Critic(nn.Module):
    def __init__(self, input_dim):
        super(Critic, self).__init__()
        self.linear = nn.Linear(input_dim, 1)
    
    def forward(self, state):
        return self.linear(state)

def train(actor, critic, state, action, reward, next_state, done):
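    # note: for a single illustrative step this is fine, but in a real training loop
    # the optimizers would be created once, outside this function, so that Adam's
    # internal state persists between updates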
    optimizer_actor = optim.Adam(actor.parameters(), lr=1e-3)
    optimizer_critic = optim.Adam(critic.parameters(), lr=1e-3)
    
    # Critic update
    value = critic(state)
    next_value = critic(next_state)
    td_error = reward + (1 - done) * 0.99 * next_value - value
    critic_loss = td_error.pow(2)

    optimizer_critic.zero_grad()
    critic_loss.backward()
    optimizer_critic.step()

    # Actor update
    log_prob = torch.log(actor(state)[action])
    actor_loss = -log_prob * td_error.detach()

    optimizer_actor.zero_grad()
    actor_loss.backward()
    optimizer_actor.step()
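
A hedged usage sketch for this train function, assuming a CartPole-like task with a 4-dimensional observation and 2 discrete actions (the concrete numbers below are made up for illustration):

actor = Actor(input_dim=4, output_dim=2)
critic = Critic(input_dim=4)

# one fabricated transition (s, a, r, s', done); in practice it comes from the environment
state = torch.tensor([0.1, 0.2, 0.3, 0.4])
next_state = torch.tensor([0.1, 0.25, 0.28, 0.35])
action = 1      # index of the action that was taken
reward = 1.0
done = 0.0      # 0.0 while the episode continues, 1.0 at the terminal step

train(actor, critic, state, action, reward, next_state, done)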

Implementation in TensorFlow:

import tensorflow as tf
from tensorflow.keras import layers

class ActorCritic(tf.keras.Model):
    def __init__(self, num_actions):
        super(ActorCritic, self).__init__()
        self.common = layers.Dense(128, activation='relu')
        self.actor = layers.Dense(num_actions, activation='softmax')
        self.critic = layers.Dense(1)

    def call(self, inputs):
        x = self.common(inputs)
        return self.actor(x), self.critic(x)

model = ActorCritic(num_actions=4)
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)

def train_step(state, action, reward, next_state, done):
    with tf.GradientTape() as tape:
        action_probs, critic_value = model(state)
        _, critic_value_next = model(next_state)
        action_log_probs = tf.math.log(action_probs[0, action])
        td_error = reward + 0.99 * critic_value_next * (1 - done) - critic_value
        actor_loss = -action_log_probs * tf.stop_gradient(td_error)
        critic_loss = td_error**2

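    # tape.gradient over a list of targets differentiates their sum, so this is
    # equivalent to minimizing actor_loss + critic_loss jointly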
    grads = tape.gradient([actor_loss, critic_loss], model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

TensorFlow/Keras with the Functional API:

import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model

def create_actor(input_shape, output_shape):
    inputs = Input(shape=input_shape)
    x = Dense(64, activation='relu')(inputs)
    outputs = Dense(output_shape, activation='softmax')(x)
    model = Model(inputs, outputs)
    return model


def create_critic(input_shape):
    inputs = Input(shape=input_shape)
    x = Dense(64, activation='relu')(inputs)
    outputs = Dense(1)(x)
    model = Model(inputs, outputs)
    return model

actor = create_actor(input_shape=(4,), output_shape=2)
critic = create_critic(input_shape=(4,))
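
These models can be sanity-checked with a dummy state; a minimal sketch assuming the 4-dimensional input shape declared above:

state = tf.constant([[0.1, 0.2, 0.3, 0.4]])  # single state with batch dimension
print("Action probabilities:", actor(state).numpy())
print("State value estimate:", critic(state).numpy())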

PyTorch with a shared network and a joint actor-critic update:

import torch
import torch.nn as nn
import torch.optim as optim

class ActorCritic(nn.Module):
    def __init__(self, input_dim, action_dim):
        super(ActorCritic, self).__init__()
        self.common = nn.Linear(input_dim, 64)
        self.actor = nn.Linear(64, action_dim)
        self.critic = nn.Linear(64, 1)
    
    def forward(self, state):
        x = torch.relu(self.common(state))
        return torch.softmax(self.actor(x), dim=-1), self.critic(x)

def train(model, state, action, reward, next_state, done):
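    # in practice the optimizer would be created once, outside the training function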
    optimizer = optim.Adam(model.parameters(), lr=5e-4)
    
    # Compute loss
    action_probs, value = model(state)
    _, next_value = model(next_state)
    td_error = reward + (1 - done) * 0.99 * next_value - value
    action_log_probs = torch.log(action_probs[action])
    actor_loss = -action_log_probs * td_error.detach()
    critic_loss = td_error.pow(2)

    # Optimize the model
    optimizer.zero_grad()
    (actor_loss + critic_loss).backward()
    optimizer.step()

Advantage Actor-Critic

Advantage Actor-Critic builds on the same Actor-Critic scheme, but the Critic's evaluation of the Actor's actions is expressed through the advantage function.

The advantage function A(s,a) measures the difference between the expected return of the selected action and the average expected return in the given state. Formally, A(s,a) = Q(s,a) − V(s), where Q(s,a) is the expected return for taking action a in state s, and V(s) is the expected return of state s regardless of the action. This lets the Actor update its policy in the direction of increasing advantage, that is, toward actions that perform better than the average for that state.
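
In practice Q(s,a) is usually not learned separately; the one-step TD estimate r + gamma * V(s') is used in its place, so the advantage reduces to the TD error seen above. A minimal PyTorch sketch with made-up critic outputs:

import torch

def one_step_advantage(rewards, values, next_values, dones, gamma=0.99):
    # A(s, a) ≈ r + gamma * V(s') * (1 - done) - V(s)
    return rewards + gamma * next_values * (1 - dones) - values

# fabricated critic outputs for a batch of three transitions
rewards     = torch.tensor([1.0, 1.0, 0.0])
values      = torch.tensor([0.9, 0.7, 0.5])   # V(s)
next_values = torch.tensor([0.7, 0.5, 0.0])   # V(s')
dones       = torch.tensor([0.0, 0.0, 1.0])
print(one_step_advantage(rewards, values, next_values, dones))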

Asynchronous Advantage Actor-Critic (A3C): one of the best-known variants, in which several actors explore the environment in parallel and asynchronously update a shared global model. Example:

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import threading
import gym

class ActorCritic(nn.Module):
    def __init__(self, num_inputs, num_actions):
        super(ActorCritic, self).__init__()
        self.critic = nn.Linear(num_inputs, 1)
        self.actor = nn.Linear(num_inputs, num_actions)
    
    def forward(self, x):
        value = self.critic(x)
        probs = torch.softmax(self.actor(x), dim=-1)
        return probs, value

def worker(global_model, optimizer, env_name, global_results, lock, worker_id):
    local_model = ActorCritic(num_inputs, num_actions)
    local_model.load_state_dict(global_model.state_dict())
    
    env = gym.make(env_name)
    state = env.reset()
    
    while True:
        # interaction with the environment and the local model update would go here

        with lock:
            # this sketch simply pushes the local weights to the global model;
            # a full A3C implementation would instead apply the locally computed
            # gradients to the global model through the shared optimizer
            global_model.load_state_dict(local_model.state_dict())

# global initialization and worker startup
num_inputs, num_actions = 4, 2   # CartPole-v1: 4 observation values, 2 discrete actions
num_workers = 4                  # arbitrary number of workers for this sketch
global_results = []
lock = threading.Lock()

global_model = ActorCritic(num_inputs, num_actions)
optimizer = optim.Adam(global_model.parameters())
workers = [threading.Thread(target=worker, args=(global_model, optimizer, 'CartPole-v1', global_results, lock, i)) for i in range(num_workers)]
for w in workers:
    w.start()
for w in workers:
    w.join()

Synchronous Advantage Actor-Critic (Synchronous A2C, or simply A2C): unlike A3C, A2C updates the global model synchronously, using the average of the gradients from all actors. This reduces the variance of the updates to some extent, although it can be slower than the asynchronous approach. Example:

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import gym

class ActorCritic(nn.Module):
    # the same architecture as in the A3C example above
    def __init__(self, num_inputs, num_actions):
        super(ActorCritic, self).__init__()
        self.critic = nn.Linear(num_inputs, 1)
        self.actor = nn.Linear(num_inputs, num_actions)

    def forward(self, x):
        return torch.softmax(self.actor(x), dim=-1), self.critic(x)

def update_global(optimizer, global_model, loss):
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

env = gym.make('CartPole-v1')
num_inputs = env.observation_space.shape[0]
num_actions = env.action_space.n
total_episodes = 1000  # arbitrary value for this sketch
global_model = ActorCritic(num_inputs, num_actions)
optimizer = optim.Adam(global_model.parameters())

# synchronous update
for episode in range(total_episodes):
    state = env.reset()
    done = False
    while not done:
        # act in the environment and collect data for the update
        loss = compute_loss(...)  # loss computed from the collected actions and value estimates
        update_global(optimizer, global_model, loss)
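
The "average of the gradients from all actors" step is only implied above; a minimal sketch of how it could look, assuming each worker has already computed a loss for its own batch through the shared global_model (the worker_losses list is a placeholder introduced here, not part of the original example):

def synchronous_update(optimizer, worker_losses):
    # averaging the per-worker losses and backpropagating once is equivalent to
    # averaging the per-worker gradients, since every loss flows through the shared model
    optimizer.zero_grad()
    mean_loss = torch.stack(worker_losses).mean()
    mean_loss.backward()
    optimizer.step()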

Proximal Policy Optimization (PPO): although not a direct variant of A2C, PPO takes the ideas of Actor-Critic methods further by introducing a clipped loss function that limits how much the policy can change on each update:

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import gym

class ActorCritic(nn.Module):
    def __init__(self, num_inputs, num_actions):
        super(ActorCritic, self).__init__()
        self.critic = nn.Linear(num_inputs, 1)
        self.actor = nn.Linear(num_inputs, num_actions)
    
    def forward(self, x):
        value = self.critic(x)
        probs = torch.softmax(self.actor(x), dim=-1)
        return probs, value

def ppo_loss(old_log_probs, new_log_probs, advantages, clip_param=0.2):
    ratios = torch.exp(new_log_probs - old_log_probs)
    surr1 = ratios * advantages
    surr2 = torch.clamp(ratios, 1.0 - clip_param, 1.0 + clip_param) * advantages
    return -torch.min(surr1, surr2).mean()

def compute_advantages(rewards, values, gamma=0.99, tau=0.95):
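    # assumes `values` has one more element than `rewards`: the last entry is the
    # bootstrap value V(s_T) for the state reached after the final step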
    advantages = torch.zeros_like(rewards)
    gae = 0
    for t in reversed(range(len(rewards))):
        delta = rewards[t] + gamma * values[t + 1] - values[t]
        gae = delta + gamma * tau * gae
        advantages[t] = gae
    return advantages

# initialize the environment, model, and optimizer
env = gym.make('CartPole-v1')
num_inputs = env.observation_space.shape[0]
num_actions = env.action_space.n
model = ActorCritic(num_inputs, num_actions)
optimizer = optim.Adam(model.parameters(), lr=1e-3)

for epoch in range(10):  # example number of training epochs
    # environment rollout: collecting states, actions, rewards, etc. goes here

    # it is assumed that the transition data have already been collected:
    # states, actions, rewards, next_states, dones

    # compute log-probabilities and advantages, evaluate the loss, and take an optimization step
    old_log_probs = ...  # log-probabilities of the actions under the old policy
    new_log_probs = ...  # log-probabilities of the same actions under the current model
    advantages = ...     # advantages computed with compute_advantages

    loss = ppo_loss(old_log_probs, new_log_probs, advantages)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

In conclusion, I would like to invite you to a free webinar about using the FinRL framework to model a trading agent. Registration is available via this link.
