# Source code for pytorch_seed_rl.functional.loss

# Copyright 2020 Michael Janschek
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Collection of loss functions necessary for reinforcement learning objective calculations.
"""

import torch
import torch.nn.functional as F


def entropy(logits: torch.Tensor) -> torch.Tensor:
    """Return the entropy loss, i.e., the negative entropy of the policy.

    Adding this term to the training objective penalizes low-entropy
    (near-deterministic) policies, which can be used to discourage an
    RL model from converging prematurely.

    The softmax is taken over the last dimension of :py:attr:`logits`;
    the per-element products ``p * log(p)`` are then summed over *all*
    dimensions (no averaging), so the result scales with the number of
    distributions contained in :py:attr:`logits`.

    See Also
    --------
    `Entropy Regularization in Reinforcement Learning
    <https://towardsdatascience.com/entropy-regularization-in-reinforcement-learning-a6fa6d7598df>`__

    Parameters
    ----------
    logits: :py:class:`torch.Tensor`
        Logits returned by the model's policy network. The last dimension
        indexes the actions.

    Returns
    -------
    :py:class:`torch.Tensor`
        Scalar tensor holding ``sum(p * log(p))``, which is never positive.
    """
    # log_softmax is used instead of log(softmax(...)) so that the
    # logarithm stays finite when a probability underflows to zero.
    log_policy = F.log_softmax(logits, dim=-1)
    policy = F.softmax(logits, dim=-1)
    return torch.sum(policy * log_policy)


def policy_gradient(logits: torch.Tensor,
                    actions: torch.Tensor,
                    advantages: torch.Tensor) -> torch.Tensor:
    """Compute the policy gradient loss.

    The loss is the sum over all steps of
    ``-log(pi(action | state)) * advantage``. The advantages are detached,
    so gradients only flow into :py:attr:`logits`.

    See Also
    --------
    `https://spinningup.openai.com
    <https://spinningup.openai.com/en/latest/spinningup/rl_intro3.html>`__

    Parameters
    ----------
    logits: :py:class:`torch.Tensor`
        Logits returned by the model's policy network, of shape
        ``[..., num_actions]`` (e.g. ``[T, B, num_actions]``).
    actions: :py:class:`torch.Tensor`
        Actions that were selected from :py:attr:`logits`, holding one
        action index per leading position of :py:attr:`logits`.
        Values are cast to :py:obj:`torch.long`.
    advantages: :py:class:`torch.Tensor`
        Advantages that resulted for the related states. Must contain one
        value per action.

    Returns
    -------
    :py:class:`torch.Tensor`
        Scalar tensor holding the summed (not averaged) loss.
    """
    # Collapse every leading dimension into one batch dimension, because
    # nll_loss expects [N, C] inputs with [N] targets. Reshaping relative
    # to the last dimension (rather than flattening dims 0 and 1) keeps
    # the [T, B, A] result unchanged and also supports other ranks.
    flat_log_policy = F.log_softmax(
        logits.reshape(-1, logits.shape[-1]), dim=-1)
    flat_actions = actions.reshape(-1).to(torch.long)

    # Negative log-probability of every taken action, kept per element so
    # it can be weighted by its advantage.
    cross_entropy = F.nll_loss(
        flat_log_policy,
        target=flat_actions,
        reduction="none",
    )
    cross_entropy = cross_entropy.view_as(advantages)

    # Advantages act as constant weights; detach() prevents gradients from
    # reaching whatever produced them.
    return torch.sum(cross_entropy * advantages.detach())