I'm running into an InternalError complaining about a mismatched data type (expected float32, got int32) when I run the quickstart code (D4PG agent) on LunarLanderContinuous-v2 and MountainCarContinuous-v0. Here is the code that generates the error:
# python3
# Copyright 2018 DeepMind Technologies Limited. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Example running D4PG on the OpenAI Gym."""
from typing import Mapping, Sequence
from absl import app
from absl import flags
import acme
from acme import specs
from acme import types
from acme import wrappers
from acme.agents.tf import actors
from acme.agents.tf import d4pg
from acme.tf import networks
from acme.tf import utils as tf2_utils
import dm_env
import gym
import numpy as np
import sonnet as snt
FLAGS = flags.FLAGS
flags.DEFINE_integer('num_episodes', 100,
                     'Number of training episodes to run for.')
flags.DEFINE_integer('num_episodes_per_eval', 10,
                     'Number of training episodes to run between evaluation '
                     'episodes.')

def make_environment(
    task: str = 'LunarLanderContinuous-v2') -> dm_env.Environment:
  """Creates an OpenAI Gym environment."""
  # Load the gym environment.
  environment = gym.make(task)
  # Make sure the environment obeys the dm_env.Environment interface.
  environment = wrappers.GymWrapper(environment)
  # Cast double-precision values to single precision. (As I understand it,
  # this wrapper maps float64 -> float32 and int64 -> int32; it does not
  # promote integer values to float32.)
  environment = wrappers.SinglePrecisionWrapper(environment)
  return environment

# The default settings in this network factory will work well for the
# MountainCarContinuous-v0 task but may need to be tuned for others. In
# particular, the vmin/vmax and num_atoms hyperparameters should be set to
# give the distributional critic a good dynamic range over possible discounted
# returns. Note that this is very different from the scale of immediate rewards.
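# As a rough rule of thumb (an aside on my part, not from the original
# example): if per-step rewards are bounded in magnitude by r_max and the
# discount is gamma, the discounted return is bounded by r_max / (1 - gamma);
# e.g. r_max = 1.5 with gamma = 0.99 gives roughly +/-150, in line with the
# defaults below.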
def make_networks(
    action_spec: specs.BoundedArray,
    policy_layer_sizes: Sequence[int] = (256, 256, 256),
    critic_layer_sizes: Sequence[int] = (512, 512, 256),
    vmin: float = -150.,
    vmax: float = 150.,
    num_atoms: int = 51,
) -> Mapping[str, types.TensorTransformation]:
  """Creates the networks used by the agent."""
  # Get total number of action dimensions from action spec.
  num_dimensions = np.prod(action_spec.shape, dtype=int)
  # Create the shared observation network; here simply a state-less operation.
  observation_network = tf2_utils.batch_concat
  # Create the policy network.
  policy_network = snt.Sequential([
      networks.LayerNormMLP(policy_layer_sizes, activate_final=True),
      networks.NearZeroInitializedLinear(num_dimensions),
      networks.TanhToSpec(action_spec),
  ])
  # Create the critic network.
  critic_network = snt.Sequential([
      # The multiplexer concatenates the observations/actions.
      networks.CriticMultiplexer(),
      networks.LayerNormMLP(critic_layer_sizes, activate_final=True),
      networks.DiscreteValuedHead(vmin, vmax, num_atoms),
  ])
  return {
      'policy': policy_network,
      'critic': critic_network,
      'observation': observation_network,
  }

def main(argv):
  # Positional arguments: nbits, repetition index, and the directory used to
  # label the loggers.
  nbits = int(argv[1])
  iteration = int(argv[2])
  dirpath = str(argv[3])

  # Create an environment, grab the spec, and use it to create networks.
  environment = make_environment()
  environment_spec = specs.make_environment_spec(environment)
  agent_networks = make_networks(environment_spec.actions)

  # Construct the agent. (The nbits argument is consumed by my locally
  # modified copy of the D4PG agent; it is not part of stock Acme.)
  agent = d4pg.D4PG(
      environment_spec=environment_spec,
      policy_network=agent_networks['policy'],
      critic_network=agent_networks['critic'],
      observation_network=agent_networks['observation'],
      sigma=1.0,
      nbits=nbits,
  )

  # Create the environment loop used for training.
  label_prefix = '%s/nbits=%d_rep=%d' % (dirpath, nbits, iteration)
  train_loop = acme.EnvironmentLoop(
      environment, agent,
      label='%s/train_loop_nbits=%d_rep=%d' % (label_prefix, nbits, iteration))

  # Create the evaluation policy.
  eval_policy = snt.Sequential([
      agent_networks['observation'],
      agent_networks['policy'],
  ])

  # Create the evaluation actor and loop.
  eval_actor = actors.FeedForwardActor(policy_network=eval_policy)
  eval_env = make_environment()
  eval_loop = acme.EnvironmentLoop(
      eval_env, eval_actor,
      label='%s/eval_loop_nbits=%d_rep=%d' % (label_prefix, nbits, iteration))

  for _ in range(FLAGS.num_episodes // FLAGS.num_episodes_per_eval):
    train_loop.run(num_episodes=FLAGS.num_episodes_per_eval)
    eval_loop.run(num_episodes=1)


if __name__ == '__main__':
  app.run(main)

Here is the output; training runs normally for a while and then fails with the InternalError:

W0619 05:19:35.448845 47712755015296 backprop.py:1021] Calling GradientTape.gradient on a persistent tape inside its context is significantly less efficient than calling it outside the context (it causes the gradient ops to be recorded on the tape, leading to increased CPU and memory usage). Only call GradientTape.gradient inside the context if you actually want to trace the gradient in order to compute higher order derivatives.
[Learner] Critic Loss = 3.948 | Policy Loss = 0.474 | Steps = 1 | Walltime = 0
[Data Dbg/Nbits=32 Rep=1/Train Loop Nbits=32 Rep=1] Episode Length = 87 | Episode Return = -212.909 | Episodes = 10 | Steps = 1073 | Steps Per Second = 3.896
[Learner] Critic Loss = 3.893 | Policy Loss = 0.385 | Steps = 21 | Walltime = 1.026
[Data Dbg/Nbits=32 Rep=1/Train Loop Nbits=32 Rep=1] Episode Length = 113 | Episode Return = -252.268 | Episodes = 12 | Steps = 1273 | Steps Per Second = 195.374
[Learner] Critic Loss = 3.892 | Policy Loss = 0.403 | Steps = 46 | Walltime = 2.032
[Data Dbg/Nbits=32 Rep=1/Train Loop Nbits=32 Rep=1] Episode Length = 83 | Episode Return = -418.158 | Episodes = 15 | Steps = 1480 | Steps Per Second = 190.304
[Learner] Critic Loss = 3.853 | Policy Loss = 0.382 | Steps = 71 | Walltime = 3.044
[Data Dbg/Nbits=32 Rep=1/Train Loop Nbits=32 Rep=1] Episode Length = 53 | Episode Return = -151.977 | Episodes = 18 | Steps = 1692 | Steps Per Second = 196.198
[Learner] Critic Loss = 3.850 | Policy Loss = 0.340 | Steps = 97 | Walltime = 4.077
[Data Dbg/Nbits=32 Rep=1/Train Loop Nbits=32 Rep=1] Episode Length = 63 | Episode Return = -261.436 | Episodes = 21 | Steps = 1905 | Steps Per Second = 198.534
[Learner] Critic Loss = 3.807 | Policy Loss = 0.412 | Steps = 122 | Walltime = 5.090
[Data Dbg/Nbits=32 Rep=1/Train Loop Nbits=32 Rep=1] Episode Length = 101 | Episode Return = -604.001 | Episodes = 24 | Steps = 2146 | Steps Per Second = 197.962
[Learner] Critic Loss = 3.751 | Policy Loss = 0.423 | Steps = 148 | Walltime = 6.128
Traceback (most recent call last):
File "run_d4pg.py", line 143, in <module>
app.run(main)
File "/n/janapa_reddi_lab/maxlam/conda_quarl/lib/python3.6/site-packages/absl/app.py", line 299, in run
_run_main(main, args)
File "/n/janapa_reddi_lab/maxlam/conda_quarl/lib/python3.6/site-packages/absl/app.py", line 250, in _run_main
sys.exit(main(argv))
File "run_d4pg.py", line 138, in main
train_loop.run(num_episodes=FLAGS.num_episodes_per_eval)
File "/n/janapa_reddi_lab/maxlam/conda_quarl/lib/python3.6/site-packages/acme/environment_loop.py", line 99, in run
self._actor.update()
File "/n/janapa_reddi_lab/maxlam/conda_quarl/lib/python3.6/site-packages/acme/agents/agent.py", line 87, in update
self._learner.step()
File "/n/janapa_reddi_lab/maxlam/conda_quarl/lib/python3.6/site-packages/acme/agents/tf/d4pg/learning.py", line 251, in step
fetches = self._step()
File "/n/janapa_reddi_lab/maxlam/conda_quarl/lib/python3.6/site-packages/tensorflow/python/eager/def_function.py", line 767, in __call__
result = self._call(*args, **kwds)
File "/n/janapa_reddi_lab/maxlam/conda_quarl/lib/python3.6/site-packages/tensorflow/python/eager/def_function.py", line 794, in _call
return self._stateless_fn(*args, **kwds) # pylint: disable=not-callable
File "/n/janapa_reddi_lab/maxlam/conda_quarl/lib/python3.6/site-packages/tensorflow/python/eager/function.py", line 2811, in __call__
return graph_function._filtered_call(args, kwargs) # pylint: disable=protected-access
File "/n/janapa_reddi_lab/maxlam/conda_quarl/lib/python3.6/site-packages/tensorflow/python/eager/function.py", line 1838, in _filtered_call
cancellation_manager=cancellation_manager)
File "/n/janapa_reddi_lab/maxlam/conda_quarl/lib/python3.6/site-packages/tensorflow/python/eager/function.py", line 1914, in _call_flat
ctx, args, cancellation_manager=cancellation_manager))
File "/n/janapa_reddi_lab/maxlam/conda_quarl/lib/python3.6/site-packages/tensorflow/python/eager/function.py", line 549, in call
ctx=ctx)
File "/n/janapa_reddi_lab/maxlam/conda_quarl/lib/python3.6/site-packages/tensorflow/python/eager/execute.py", line 60, in quick_execute
inputs, attrs, num_outputs)
tensorflow.python.framework.errors_impl.InternalError: 2 root error(s) found.
(0) Internal: Output 6 of type int32 does not match declared output type float for node node IteratorGetNext (defined at /n/janapa_reddi_lab/maxlam/conda_quarl/lib/python3.6/site-packages/acme/agents/tf/d4pg/learning.py:178)
(1) Cancelled: Function was cancelled before it was started
0 successful operations.
0 derived errors ignored. [Op:__inference__step_9555]
Function call stack:
_step -> _step
[reverb/cc/platform/default/server.cc:64] Shutting down replay server
W0619 05:20:06.683977 47712755015296 client.py:112] Writer-object deleted without calling .close explicitly.
[reverb/cc/writer.cc:231] Received error when closing the stream: [14] Socket closed
I'm not sure where the int32 is coming from; any assistance with this issue would be appreciated. Thanks a lot!
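For what it's worth, here is a minimal snippet (assuming the same wrappers as in the script above; nothing here touches my D4PG modifications) that compares the dtypes declared by the environment spec against the dtypes an actual transition carries:

import gym
import numpy as np

from acme import specs
from acme import wrappers

environment = wrappers.SinglePrecisionWrapper(
    wrappers.GymWrapper(gym.make('LunarLanderContinuous-v2')))
environment_spec = specs.make_environment_spec(environment)

# Dtypes declared by the spec (the learner's dataset signature is derived
# from these).
print('spec observations:', environment_spec.observations.dtype)
print('spec actions:     ', environment_spec.actions.dtype)
print('spec rewards:     ', environment_spec.rewards.dtype)
print('spec discounts:   ', environment_spec.discounts.dtype)

# Dtypes of a transition the environment actually emits at runtime.
environment.reset()
action = np.zeros(environment_spec.actions.shape,
                  dtype=environment_spec.actions.dtype)
timestep = environment.step(action)
print('step reward:     ', np.asarray(timestep.reward).dtype)
print('step discount:   ', np.asarray(timestep.discount).dtype)
print('step observation:', np.asarray(timestep.observation).dtype)

If the spec comes out all float32 but one of the runtime values prints as an integer dtype (my guess would be the reward, since gym environments sometimes return a plain Python int on certain steps), that would explain a float32-vs-int32 mismatch between the dataset signature and the data Reverb feeds back to the learner.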