diff --git a/docs/source/using-diffusers/other-modalities.mdx b/docs/source/using-diffusers/other-modalities.mdx
index 1dc0877adb..3e1cdbde80 100644
--- a/docs/source/using-diffusers/other-modalities.mdx
+++ b/docs/source/using-diffusers/other-modalities.mdx
@@ -14,7 +14,8 @@ specific language governing permissions and limitations under the License.

 Diffusers is in the process of expanding to modalities other than images.

-Currently, one example is for [molecule conformation](https://www.nature.com/subjects/molecular-conformation#:~:text=Definition,to%20changes%20in%20their%20environment.) generation.
-* Generate conformations in Colab [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/geodiff_molecule_conformation.ipynb)
+Example type | Colab | Pipeline |
+:-------------------------:|:-------------------------:|:-------------------------:|
+[Molecule conformation](https://www.nature.com/subjects/molecular-conformation#:~:text=Definition,to%20changes%20in%20their%20environment.) generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/geodiff_molecule_conformation.ipynb) | ❌

 More coming soon!
\ No newline at end of file
diff --git a/docs/source/using-diffusers/rl.mdx b/docs/source/using-diffusers/rl.mdx
index 6e18e07001..e74cee742f 100644
--- a/docs/source/using-diffusers/rl.mdx
+++ b/docs/source/using-diffusers/rl.mdx
@@ -13,6 +13,13 @@ specific language governing permissions and limitations under the License.
 # Using Diffusers for reinforcement learning

 Support for one RL model and related pipelines is included in the `experimental` source of diffusers.
+More models and examples coming soon!

-To try some of this in colab, please look at the following example:
-* Model-based reinforcement learning on Colab [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/reinforcement_learning_with_diffusers.ipynb) ![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)
+# Diffuser Value-guided Planning
+
+You can run the model from [*Planning with Diffusion for Flexible Behavior Synthesis*](https://arxiv.org/abs/2205.09991) with Diffusers.
+The script is located in the [RL Examples](https://github.com/huggingface/diffusers/tree/main/examples/rl) folder.
+
+Or, run this example in Colab [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/reinforcement_learning_with_diffusers.ipynb)
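+
+As a quick reference, here is a minimal sketch of loading and querying the pipeline directly from Python. It mirrors the example script and assumes `d4rl`, `gym`, and a working MuJoCo setup are installed, using the `bglick13/hopper-medium-v2-value-function-hor32` checkpoint:
+
+```python
+import d4rl  # noqa
+import gym
+
+from diffusers.experimental import ValueGuidedRLPipeline
+
+# importing d4rl registers the locomotion environments with gym
+env = gym.make("hopper-medium-v2")
+
+pipeline = ValueGuidedRLPipeline.from_pretrained(
+    "bglick13/hopper-medium-v2-value-function-hor32",
+    env=env,
+)
+
+obs = env.reset()
+# plan over a 32-step horizon and return the first denormalized action
+denorm_actions = pipeline(obs, planning_horizon=32)
+obs, reward, terminal, _ = env.step(denorm_actions)
+```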
+
+[[autodoc]] diffusers.experimental.ValueGuidedRLPipeline
\ No newline at end of file
diff --git a/examples/rl/README.md b/examples/rl/README.md
index d68f2bf780..17881d584a 100644
--- a/examples/rl/README.md
+++ b/examples/rl/README.md
@@ -1,9 +1,12 @@
 # Overview

-These examples show how to run (Diffuser)[https://arxiv.org/abs/2205.09991] in Diffusers.
-There are four scripts,
-1. `run_diffuser_locomotion.py` to sample actions and run them in the environment,
-2. and `run_diffuser_gen_trajectories.py` to just sample actions from the pre-trained diffusion model.
+These examples show how to run [Diffuser](https://arxiv.org/abs/2205.09991) in Diffusers.
+There are two ways to use the script `run_diffuser_locomotion.py`.
+
+The key option is the `n_guide_steps` variable.
+When `n_guide_steps=0`, the trajectories are sampled from the diffusion model but are not fine-tuned to maximize reward in the environment.
+By default, `n_guide_steps=2` to match the original implementation.
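+
+For reference, the relevant block in `run_diffuser_locomotion.py` looks roughly like the sketch below (the values shown are the defaults from the script):
+
+```python
+config = dict(
+    n_samples=64,
+    horizon=32,
+    num_inference_steps=20,
+    n_guide_steps=2,  # set to 0 to skip value guidance and sample faster
+    scale_grad_by_std=True,
+    scale=0.1,
+    eta=0.0,
+    t_grad_cutoff=2,
+    device="cpu",
+)
+```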
+

 You will need some RL specific requirements to run the examples:
diff --git a/examples/rl/run_diffuser_gen_trajectories.py b/examples/rl/run_diffuser_gen_trajectories.py
deleted file mode 100644
index 5bb068cc9f..0000000000
--- a/examples/rl/run_diffuser_gen_trajectories.py
+++ /dev/null
@@ -1,57 +0,0 @@
-import d4rl  # noqa
-import gym
-import tqdm
-from diffusers.experimental import ValueGuidedRLPipeline
-
-
-config = dict(
-    n_samples=64,
-    horizon=32,
-    num_inference_steps=20,
-    n_guide_steps=0,
-    scale_grad_by_std=True,
-    scale=0.1,
-    eta=0.0,
-    t_grad_cutoff=2,
-    device="cpu",
-)
-
-
-if __name__ == "__main__":
-    env_name = "hopper-medium-v2"
-    env = gym.make(env_name)
-
-    pipeline = ValueGuidedRLPipeline.from_pretrained(
-        "bglick13/hopper-medium-v2-value-function-hor32",
-        env=env,
-    )
-
-    env.seed(0)
-    obs = env.reset()
-    total_reward = 0
-    total_score = 0
-    T = 1000
-    rollout = [obs.copy()]
-    try:
-        for t in tqdm.tqdm(range(T)):
-            # Call the policy
-            denorm_actions = pipeline(obs, planning_horizon=32)
-
-            # execute action in environment
-            next_observation, reward, terminal, _ = env.step(denorm_actions)
-            score = env.get_normalized_score(total_reward)
-            # update return
-            total_reward += reward
-            total_score += score
-            print(
-                f"Step: {t}, Reward: {reward}, Total Reward: {total_reward}, Score: {score}, Total Score:"
-                f" {total_score}"
-            )
-            # save observations for rendering
-            rollout.append(next_observation.copy())
-
-            obs = next_observation
-    except KeyboardInterrupt:
-        pass
-
-    print(f"Total reward: {total_reward}")
diff --git a/examples/rl/run_diffuser_locomotion.py b/examples/rl/run_diffuser_locomotion.py
index e89181610b..e64a20500b 100644
--- a/examples/rl/run_diffuser_locomotion.py
+++ b/examples/rl/run_diffuser_locomotion.py
@@ -8,7 +8,7 @@ config = dict(
     n_samples=64,
     horizon=32,
     num_inference_steps=20,
-    n_guide_steps=2,
+    n_guide_steps=2,  # set to 0 for faster sampling (does not use the value network)
     scale_grad_by_std=True,
     scale=0.1,
     eta=0.0,
@@ -40,6 +40,7 @@ if __name__ == "__main__":
         # execute action in environment
         next_observation, reward, terminal, _ = env.step(denorm_actions)
         score = env.get_normalized_score(total_reward)
+
         # update return
         total_reward += reward
         total_score += score
@@ -47,6 +48,7 @@ if __name__ == "__main__":
             f"Step: {t}, Reward: {reward}, Total Reward: {total_reward}, Score: {score}, Total Score:"
             f" {total_score}"
         )
+
         # save observations for rendering
         rollout.append(next_observation.copy())

diff --git a/src/diffusers/experimental/rl/value_guided_sampling.py b/src/diffusers/experimental/rl/value_guided_sampling.py
index 4dd935f54d..27bef08182 100644
--- a/src/diffusers/experimental/rl/value_guided_sampling.py
+++ b/src/diffusers/experimental/rl/value_guided_sampling.py
@@ -23,6 +23,22 @@ from ...utils.dummy_pt_objects import DDPMScheduler


 class ValueGuidedRLPipeline(DiffusionPipeline):
+    r"""
+    Pipeline for sampling actions from a diffusion model trained to predict sequences of states.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.).
+
+    Original implementation inspired by this repository: https://github.com/jannerm/diffuser.
+
+    Parameters:
+        value_function ([`UNet1DModel`]): A specialized UNet for fine-tuning trajectories based on reward.
+        unet ([`UNet1DModel`]): U-Net architecture to denoise the encoded trajectories.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded trajectories. Default for this
+            application is [`DDPMScheduler`].
+        env: An environment following the OpenAI gym API to act in. For now, only Hopper has pretrained models.
+    """
+
     def __init__(
         self,
         value_function: UNet1DModel,
@@ -78,21 +94,26 @@ class ValueGuidedRLPipeline(DiffusionPipeline):
         for _ in range(n_guide_steps):
             with torch.enable_grad():
                 x.requires_grad_()
+
+                # permute to match the dimension ordering expected by the pre-trained models
                 y = self.value_function(x.permute(0, 2, 1), timesteps).sample
                 grad = torch.autograd.grad([y.sum()], [x])[0]

                 posterior_variance = self.scheduler._get_variance(i)
                 model_std = torch.exp(0.5 * posterior_variance)
                 grad = model_std * grad
+
             grad[timesteps < 2] = 0
             x = x.detach()
             x = x + scale * grad
             x = self.reset_x0(x, conditions, self.action_dim)
+
         prev_x = self.unet(x.permute(0, 2, 1), timesteps).sample.permute(0, 2, 1)
-        # TODO: set prediction_type when instantiating the model
+
+        # TODO: verify deprecation of this kwarg
         x = self.scheduler.step(prev_x, i, x, predict_epsilon=False)["prev_sample"]

-        # apply conditions to the trajectory
+        # apply conditions to the trajectory (set the initial state)
         x = self.reset_x0(x, conditions, self.action_dim)
         x = self.to_torch(x)
         return x, y
@@ -126,5 +147,6 @@ class ValueGuidedRLPipeline(DiffusionPipeline):
         else:
             # if we didn't run value guiding, select a random action
             selected_index = np.random.randint(0, batch_size)
+
         denorm_actions = denorm_actions[selected_index, 0]
         return denorm_actions