* 📄 Renamed file for better understanding: renamed the 'rl' file to 'run_locomotion'. The 'rl' name was ambiguous; 'run_locomotion' describes the file's purpose more clearly. Thanks 🙌
* 📁 [Docs] Renamed directory for better clarity: renamed the 'rl' directory to 'reinforcement_learning', which gives a clearer picture of the directory's purpose and contents.
* Update examples/reinforcement_learning/README.md
* 📝 Update README

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
60 lines · 1.5 KiB · Python
import d4rl  # noqa (imported for its side effect of registering the D4RL environments with gym)
import gym
import tqdm

from diffusers.experimental import ValueGuidedRLPipeline

config = {
    "n_samples": 64,
    "horizon": 32,
    "num_inference_steps": 20,
    "n_guide_steps": 2,  # can be set to 0 for faster sampling; the value network is then unused
    "scale_grad_by_std": True,
    "scale": 0.1,
    "eta": 0.0,
    "t_grad_cutoff": 2,
    "device": "cpu",
}
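
# `config` collects the sampling hyperparameters in one place, but the main block
# below only passes `planning_horizon` explicitly. A minimal sketch of wiring the
# remaining values through, assuming the installed ValueGuidedRLPipeline.__call__
# accepts `batch_size`, `n_guide_steps`, and `scale` keyword arguments (verify
# against your diffusers version):
#
#     denorm_actions = pipeline(
#         obs,
#         batch_size=config["n_samples"],
#         planning_horizon=config["horizon"],
#         n_guide_steps=config["n_guide_steps"],
#         scale=config["scale"],
#     )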


if __name__ == "__main__":
    env_name = "hopper-medium-v2"
    env = gym.make(env_name)

    pipeline = ValueGuidedRLPipeline.from_pretrained(
        "bglick13/hopper-medium-v2-value-function-hor32",
        env=env,
    )
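
    # The checkpoint loaded above comes from the Hugging Face Hub. For this
    # experimental pipeline, it bundles a diffusion model over state-action
    # trajectories together with the value function used to guide sampling
    # toward high-return plans.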

    env.seed(0)
    obs = env.reset()
    total_reward = 0
    total_score = 0
    T = 1000
    rollout = [obs.copy()]
    try:
        for t in tqdm.tqdm(range(T)):
            # call the policy: plan from the current observation and return the
            # next denormalized action
            denorm_actions = pipeline(obs, planning_horizon=32)

            # execute the action in the environment
            next_observation, reward, terminal, _ = env.step(denorm_actions)

            # update the return, then compute its normalized D4RL score so the
            # printed score matches the printed total reward
            total_reward += reward
            score = env.get_normalized_score(total_reward)
            total_score += score
            print(
                f"Step: {t}, Reward: {reward}, Total Reward: {total_reward}, Score: {score}, Total Score:"
                f" {total_score}"
            )

            # save observations for rendering
            rollout.append(next_observation.copy())

            obs = next_observation

            # stop early if the environment signals the end of the episode
            if terminal:
                break
    except KeyboardInterrupt:
        pass

    print(f"Total reward: {total_reward}")
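
    # The observations collected in `rollout` can be saved for offline rendering
    # or analysis. A minimal sketch, assuming NumPy is installed (the output path
    # "hopper_rollout.npy" is hypothetical):
    #
    #     import numpy as np
    #     np.save("hopper_rollout.npy", np.stack(rollout))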