
Visual (RGB) input leads to unobservable and invalid videos. #207

Open
Felixvillas opened this issue Dec 4, 2024 · 11 comments
@Felixvillas

When I train a model with BC using RGB input, the generated rollout video is unwatchable.
Here is my bc.json:

{
    "algo_name": "bc",
    "experiment": {
        "name": "test",
        "validate": false,
        "logging": {
            "terminal_output_to_txt": true,
            "log_tb": true,
            "log_wandb": false,
            "wandb_proj_name": "debug"
        },
        "save": {
            "enabled": true,
            "every_n_seconds": null,
            "every_n_epochs": 50,
            "epochs": [],
            "on_best_validation": false,
            "on_best_rollout_return": false,
            "on_best_rollout_success_rate": true
        },
        "epoch_every_n_steps": 100,
        "validation_epoch_every_n_steps": 10,
        "env": null,
        "additional_envs": null,
        "render": false,
        "render_video": true,
        "keep_all_videos": false,
        "video_skip": 1,
        "rollout": {
            "enabled": true,
            "n": 50,
            "horizon": 400,
            "rate": 50,
            "warmstart": 0,
            "terminate_on_success": true
        }
    },
    "train": {
        "data": null,
        "output_dir": "../bc_trained_models",
        "num_data_workers": 0,
        "hdf5_cache_mode": "all",
        "hdf5_use_swmr": true,
        "hdf5_load_next_obs": false,
        "hdf5_normalize_obs": false,
        "hdf5_filter_key": null,
        "hdf5_validation_filter_key": null,
        "seq_length": 1,
        "pad_seq_length": true,
        "frame_stack": 1,
        "pad_frame_stack": true,
        "dataset_keys": [
            "actions",
            "rewards",
            "dones"
        ],
        "goal_mode": null,
        "cuda": true,
        "batch_size": 100,
        "num_epochs": 4000,
        "seed": 1
    },
    "algo": {
        "optim_params": {
            "policy": {
                "optimizer_type": "adam",
                "learning_rate": {
                    "initial": 0.0001,
                    "decay_factor": 0.1,
                    "epoch_schedule": [],
                    "scheduler_type": "multistep"
                },
                "regularization": {
                    "L2": 0.0
                }
            }
        },
        "loss": {
            "l2_weight": 1.0,
            "l1_weight": 0.0,
            "cos_weight": 0.0
        },
        "actor_layer_dims": [
            1024,
            1024
        ],
        "gaussian": {
            "enabled": false,
            "fixed_std": false,
            "init_std": 0.1,
            "min_std": 0.01,
            "std_activation": "softplus",
            "low_noise_eval": true
        },
        "gmm": {
            "enabled": false,
            "num_modes": 5,
            "min_std": 0.0001,
            "std_activation": "softplus",
            "low_noise_eval": true
        },
        "vae": {
            "enabled": false,
            "latent_dim": 14,
            "latent_clip": null,
            "kl_weight": 1.0,
            "decoder": {
                "is_conditioned": true,
                "reconstruction_sum_across_elements": false
            },
            "prior": {
                "learn": false,
                "is_conditioned": false,
                "use_gmm": false,
                "gmm_num_modes": 10,
                "gmm_learn_weights": false,
                "use_categorical": false,
                "categorical_dim": 10,
                "categorical_gumbel_softmax_hard": false,
                "categorical_init_temp": 1.0,
                "categorical_temp_anneal_step": 0.001,
                "categorical_min_temp": 0.3
            },
            "encoder_layer_dims": [
                300,
                400
            ],
            "decoder_layer_dims": [
                300,
                400
            ],
            "prior_layer_dims": [
                300,
                400
            ]
        },
        "rnn": {
            "enabled": false,
            "horizon": 10,
            "hidden_dim": 400,
            "rnn_type": "LSTM",
            "num_layers": 2,
            "open_loop": false,
            "kwargs": {
                "bidirectional": false
            }
        },
        "transformer": {
            "enabled": false,
            "context_length": 10,
            "embed_dim": 512,
            "num_layers": 6,
            "num_heads": 8,
            "emb_dropout": 0.1,
            "attn_dropout": 0.1,
            "block_output_dropout": 0.1,
            "sinusoidal_embedding": false,
            "activation": "gelu",
            "supervise_all_steps": false,
            "nn_parameter_for_timesteps": true
        }
    },
    "observation": {
        "modalities": {
            "obs": {
                "low_dim": [

                ],
                "rgb": [
                    "frontview_image"
                ],
                "depth": [
                    "frontview_depth"
                ],
                "scan": []
            },
            "goal": {
                "low_dim": [],
                "rgb": [],
                "depth": [],
                "scan": []
            }
        },
        "encoder": {
            "low_dim": {
                "core_class": null,
                "core_kwargs": {},
                "obs_randomizer_class": null,
                "obs_randomizer_kwargs": {}
            },
            "rgb": {
                "core_class": "VisualCore",
                "core_kwargs": {},
                "obs_randomizer_class": null,
                "obs_randomizer_kwargs": {}
            },
            "depth": {
                "core_class": "VisualCore",
                "core_kwargs": {},
                "obs_randomizer_class": null,
                "obs_randomizer_kwargs": {}
            },
            "scan": {
                "core_class": "ScanCore",
                "core_kwargs": {},
                "obs_randomizer_class": null,
                "obs_randomizer_kwargs": {}
            }
        }
    },
    "meta": {
        "hp_base_config_file": null,
        "hp_keys": [],
        "hp_values": []
    }
}

My train script is:

CUDA_VISIBLE_DEVICES=0 python robomimic/scripts/train.py --config robomimic/exps/templates/bc1_img.json --dataset ${my hdf5 data path}

Using RGB as input produces the following warning:

WARNING:imageio_ffmpeg:IMAGEIO FFMPEG_WRITER WARNING: input image is not divisible by macro_block_size=16, resizing from (3, 512) to (16, 512) to ensure video compatibility with most codecs and players. To prevent resizing, make your input image divisible by the macro_block_size or set the macro_block_size to 1 (risking incompatibility).
[swscaler @ 0x70c3880] Warning: data is not aligned! This can lead to a speed loss

When I set macro_block_size=1 or macro_block_size=None in imageio.get_writer(video_paths[k], fps=20, macro_block_size=xxx), I get this error instead:

[swscaler @ 0x715ba00] Warning: data is not aligned! This can lead to a speed loss
[libx264 @ 0x713d640] width not divisible by 2 (3x512)
Error initializing output stream 0:0 -- Error while opening encoder for output stream #0:0 - maybe incorrect parameters such as bit_rate, rate, width or height
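For reference, the macro block warning itself is usually harmless when frame dimensions are even; it only becomes a hard error here because one dimension is 3. A minimal sketch (my own helper, not part of robomimic) of padding a legitimately odd-sized frame to a multiple of 16 before handing it to the video writer:

```python
import numpy as np

MACRO_BLOCK = 16  # imageio-ffmpeg's default macro_block_size

def pad_to_macro_block(frame: np.ndarray, block: int = MACRO_BLOCK) -> np.ndarray:
    """Pad an (H, W, C) frame with black pixels so H and W are multiples of `block`."""
    h, w = frame.shape[:2]
    pad_h = (-h) % block
    pad_w = (-w) % block
    if pad_h == 0 and pad_w == 0:
        return frame
    return np.pad(frame, ((0, pad_h), (0, pad_w), (0, 0)), mode="constant")

# e.g. a (510, 512, 3) frame becomes (512, 512, 3)
frame = np.zeros((510, 512, 3), dtype=np.uint8)
print(pad_to_macro_block(frame).shape)  # (512, 512, 3)
```

Note this is a workaround for odd-but-valid frame sizes; the (3, 512) frame in this issue indicates the frame itself is malformed upstream, which padding cannot fix.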

It is worth mentioning that when I use low_dim observations as input, the generated video is watchable and there is no imageio_ffmpeg warning.

Is this a bug? Or is it just me who has this issue?

Thanks!

@amandlek
Member

amandlek commented Dec 4, 2024

I have not seen this issue before - what branches and versions of robomimic are you on? And how about your simulation env?

@Felixvillas
Author

I have not seen this issue before - what branches and versions of robomimic are you on? And how about your simulation env?

master branch, robomimic 0.3.0, with an environment I constructed myself.
This seems unrelated to the simulation environment, though, since the video rendered with low_dim input is watchable.

@Felixvillas
Author

I find that the shape of video_img = env.render(mode="rgb_array", height=512, width=512) is (512, 3).
I think the correct shape should be (512, 512, 3).
I will validate this in a robosuite built-in environment.

@Felixvillas
Author

UPDATE

I found that it's not rgb but depth that causes this problem.
When I use rgb and depth simultaneously, both in the built-in task Lift and in the task I constructed myself, the video is unobservable. But when I use only rgb, the video is watchable.

@Felixvillas
Author

Felixvillas commented Dec 5, 2024

FIX BUG

I think I found the bug. In:

if self.use_depth_obs:
    # render() returns a tuple when self.use_depth_obs=True
    return im[0][::-1]

Commenting out this if branch fixes the bug for me.
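A quick numpy sketch of why that line could produce the (512, 3) shape reported above (the tuple-vs-array return difference between robosuite versions is my inference from the reported shapes, not confirmed against robosuite source):

```python
import numpy as np

h, w = 512, 512
rgb = np.zeros((h, w, 3), dtype=np.uint8)
depth = np.zeros((h, w), dtype=np.float32)

# If render() returns a tuple (rgb, depth), im[0] selects the RGB array
# and [::-1] flips it vertically - a valid (512, 512, 3) frame.
im = (rgb, depth)
print(im[0][::-1].shape)  # (512, 512, 3)

# If render() instead returns a single RGB array, im[0] selects only the
# first ROW of pixels, producing the (512, 3) "frame" seen in this issue.
im = rgb
print(im[0][::-1].shape)  # (512, 3)
```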

Please fix this bug if you can reproduce it.

Thank you!

@amandlek

@amandlek
Member

amandlek commented Dec 5, 2024

Thanks for looking into this! What versions of robosuite and mujoco are you on? It could be related to that as well. If you also have a simple minimal breaking example, we could verify the issue and fix it much more easily - for example, a single script that constructs an environment, calls the render function, and shows the shape mismatch.

@Felixvillas
Author

Felixvillas commented Dec 6, 2024

robosuite version: 1.5.0, mujoco version: 3.2.5.
Just running the training pipeline with depth in the obs triggers this bug.

Besides, I'm trying to construct a simple minimal breaking example...

@amandlek
Member

amandlek commented Dec 7, 2024

Is it also broken on robosuite v1.4? We haven't completed thorough testing for robosuite v1.5 quite yet.

@Felixvillas
Author

It works fine on robosuite v1.4.1 and does not have this bug; it appears to have occurred only in v1.5.0.

@amandlek
Member

Thanks for checking! We will look into this further while integrating support for robosuite v1.5.

@kevin-thankyou-lin

I don't think we explicitly changed the depth rendering itself between versions - one change may be that the renderer now defaults to mjviewer instead of mujoco. I'm not sure how to set these in robomimic; perhaps @amandlek knows.
