
Variable Horizon in seals/CartPole #56

Open
lcotetur opened this issue Jul 28, 2022 · 0 comments

from imitation.algorithms.adversarial.airl import AIRL
from imitation.rewards.reward_nets import BasicShapedRewardNet
from imitation.util.networks import RunningNorm
from stable_baselines3 import PPO
from stable_baselines3.ppo import MlpPolicy
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv

import gym
import seals

learners_rewards_after_training = []
learners_rewards_before_training = []
venv = DummyVecEnv([lambda: gym.make("seals/CartPole-v0")] * 8)
learner = PPO(
    env=venv,
    policy=MlpPolicy,
    batch_size=64,
    ent_coef=0.0,
    learning_rate=0.0003,
    n_epochs=10,
)
reward_net = BasicShapedRewardNet(
    venv.observation_space, venv.action_space, normalize_input_layer=RunningNorm
)
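# `rollouts` is not defined anywhere in this snippet. In the demo notebook it is a
# list of expert trajectories, collected roughly as in this sketch (the exact
# rollout.rollout signature depends on the imitation version):
#
#   from imitation.data import rollout
#   rollouts = rollout.rollout(
#       expert,  # a pre-trained expert policy for seals/CartPole-v0
#       venv,
#       rollout.make_sample_until(min_timesteps=None, min_episodes=60),
#   )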
airl_trainer = AIRL(
    demonstrations=rollouts,
    demo_batch_size=1024,
    gen_replay_buffer_capacity=2048,
    n_disc_updates_per_round=4,
    venv=venv,
    gen_algo=learner,
    reward_net=reward_net,
)

for i in range(10):
    learner_rewards_before_training, _ = evaluate_policy(
        learner, venv, 100, return_episode_rewards=True
    )
    learners_rewards_before_training.append(learner_rewards_before_training)

    airl_trainer.train(20000)  # Note: set to 300000 for better results
    learner_rewards_after_training, _ = evaluate_policy(
        learner, venv, 100, return_episode_rewards=True
    )
    learners_rewards_after_training.append(learner_rewards_after_training)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_16872\944136942.py in <module>
     41 
     42 
---> 43     airl_trainer.train(20000)  # Note: set to 300000 for better results
     44     learner_rewards_after_training, _ = evaluate_policy(
     45         learner, venv, 100, return_episode_rewards=True

c:\users\stephane\documents\imitation\src\imitation\algorithms\adversarial\common.py in train(self, total_timesteps, callback)
    416         )
    417         for r in tqdm.tqdm(range(0, n_rounds), desc="round"):
--> 418             self.train_gen(self.gen_train_timesteps)
    419             for _ in range(self.n_disc_updates_per_round):
    420                 with networks.training(self.reward_train):

c:\users\stephane\documents\imitation\src\imitation\algorithms\adversarial\common.py in train_gen(self, total_timesteps, learn_kwargs)
    385 
    386         gen_trajs, ep_lens = self.venv_buffering.pop_trajectories()
--> 387         self._check_fixed_horizon(ep_lens)
    388         gen_samples = rollout.flatten_trajectories_with_rew(gen_trajs)
    389         self._gen_replay_buffer.store(gen_samples)

c:\users\stephane\documents\imitation\src\imitation\algorithms\base.py in _check_fixed_horizon(self, horizons)
     89         if len(horizons) > 1:
     90             raise ValueError(
---> 91                 f"Episodes of different length detected: {horizons}. "
     92                 "Variable horizon environments are discouraged -- "
     93                 "termination conditions leak information about reward. See"

ValueError: Episodes of different length detected: {548, 500}. Variable horizon environments are discouraged -- termination conditions leak information about reward. See https://imitation.readthedocs.io/en/latest/guide/variable_horizon.html for more information. If you are SURE you want to run imitation on a variable horizon task, then please pass in the flag: `allow_variable_horizon=True`.

This happens when trying to run the demo from https://github.com/HumanCompatibleAI/imitation/blob/master/examples/4_train_airl.ipynb with a for loop around the training steps: the generator rollouts then contain episodes of different horizons, even though the seals environment is meant to be fixed-horizon.
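The error message itself suggests passing `allow_variable_horizon=True` when constructing the trainer. A minimal sketch of that workaround (it only silences the check and accepts the reward-leakage risk; it does not explain why a 548-step episode appears in the first place):

airl_trainer = AIRL(
    demonstrations=rollouts,
    demo_batch_size=1024,
    gen_replay_buffer_capacity=2048,
    n_disc_updates_per_round=4,
    venv=venv,
    gen_algo=learner,
    reward_net=reward_net,
    allow_variable_horizon=True,  # opt in to variable-horizon training, per the error message
)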
