Visual (RGB) input leads to unobservable and invalid videos.

Visual input leads to unobservable and invalid videos.
When I train the model with BC, the generated video is unwatchable when using RGB-form input.
Here is my `bc.json`.
```json
{
    "algo_name": "bc",
    "experiment": {
        "name": "test",
        "validate": false,
        "logging": {
            "terminal_output_to_txt": true,
            "log_tb": true,
            "log_wandb": false,
            "wandb_proj_name": "debug"
        },
        "save": {
            "enabled": true,
            "every_n_seconds": null,
            "every_n_epochs": 50,
            "epochs": [],
            "on_best_validation": false,
            "on_best_rollout_return": false,
            "on_best_rollout_success_rate": true
        },
        "epoch_every_n_steps": 100,
        "validation_epoch_every_n_steps": 10,
        "env": null,
        "additional_envs": null,
        "render": false,
        "render_video": true,
        "keep_all_videos": false,
        "video_skip": 1,
        "rollout": {
            "enabled": true,
            "n": 50,
            "horizon": 400,
            "rate": 50,
            "warmstart": 0,
            "terminate_on_success": true
        }
    },
    "train": {
        "data": null,
        "output_dir": "../bc_trained_models",
        "num_data_workers": 0,
        "hdf5_cache_mode": "all",
        "hdf5_use_swmr": true,
        "hdf5_load_next_obs": false,
        "hdf5_normalize_obs": false,
        "hdf5_filter_key": null,
        "hdf5_validation_filter_key": null,
        "seq_length": 1,
        "pad_seq_length": true,
        "frame_stack": 1,
        "pad_frame_stack": true,
        "dataset_keys": [
            "actions",
            "rewards",
            "dones"
        ],
        "goal_mode": null,
        "cuda": true,
        "batch_size": 100,
        "num_epochs": 4000,
        "seed": 1
    },
    "algo": {
        "optim_params": {
            "policy": {
                "optimizer_type": "adam",
                "learning_rate": {
                    "initial": 0.0001,
                    "decay_factor": 0.1,
                    "epoch_schedule": [],
                    "scheduler_type": "multistep"
                },
                "regularization": {
                    "L2": 0.0
                }
            }
        },
        "loss": {
            "l2_weight": 1.0,
            "l1_weight": 0.0,
            "cos_weight": 0.0
        },
        "actor_layer_dims": [
            1024,
            1024
        ],
        "gaussian": {
            "enabled": false,
            "fixed_std": false,
            "init_std": 0.1,
            "min_std": 0.01,
            "std_activation": "softplus",
            "low_noise_eval": true
        },
        "gmm": {
            "enabled": false,
            "num_modes": 5,
            "min_std": 0.0001,
            "std_activation": "softplus",
            "low_noise_eval": true
        },
        "vae": {
            "enabled": false,
            "latent_dim": 14,
            "latent_clip": null,
            "kl_weight": 1.0,
            "decoder": {
                "is_conditioned": true,
                "reconstruction_sum_across_elements": false
            },
            "prior": {
                "learn": false,
                "is_conditioned": false,
                "use_gmm": false,
                "gmm_num_modes": 10,
                "gmm_learn_weights": false,
                "use_categorical": false,
                "categorical_dim": 10,
                "categorical_gumbel_softmax_hard": false,
                "categorical_init_temp": 1.0,
                "categorical_temp_anneal_step": 0.001,
                "categorical_min_temp": 0.3
            },
            "encoder_layer_dims": [
                300,
                400
            ],
            "decoder_layer_dims": [
                300,
                400
            ],
            "prior_layer_dims": [
                300,
                400
            ]
        },
        "rnn": {
            "enabled": false,
            "horizon": 10,
            "hidden_dim": 400,
            "rnn_type": "LSTM",
            "num_layers": 2,
            "open_loop": false,
            "kwargs": {
                "bidirectional": false
            }
        },
        "transformer": {
            "enabled": false,
            "context_length": 10,
            "embed_dim": 512,
            "num_layers": 6,
            "num_heads": 8,
            "emb_dropout": 0.1,
            "attn_dropout": 0.1,
            "block_output_dropout": 0.1,
            "sinusoidal_embedding": false,
            "activation": "gelu",
            "supervise_all_steps": false,
            "nn_parameter_for_timesteps": true
        }
    },
    "observation": {
        "modalities": {
            "obs": {
                "low_dim": [

                ],
                "rgb": [
                    "frontview_image"
                ],
                "depth": [
                    "frontview_depth"
                ],
                "scan": []
            },
            "goal": {
                "low_dim": [],
                "rgb": [],
                "depth": [],
                "scan": []
            }
        },
        "encoder": {
            "low_dim": {
                "core_class": null,
                "core_kwargs": {},
                "obs_randomizer_class": null,
                "obs_randomizer_kwargs": {}
            },
            "rgb": {
                "core_class": "VisualCore",
                "core_kwargs": {},
                "obs_randomizer_class": null,
                "obs_randomizer_kwargs": {}
            },
            "depth": {
                "core_class": "VisualCore",
                "core_kwargs": {},
                "obs_randomizer_class": null,
                "obs_randomizer_kwargs": {}
            },
            "scan": {
                "core_class": "ScanCore",
                "core_kwargs": {},
                "obs_randomizer_class": null,
                "obs_randomizer_kwargs": {}
            }
        }
    },
    "meta": {
        "hp_base_config_file": null,
        "hp_keys": [],
        "hp_values": []
    }
}
```

My train script is:
```bash
CUDA_VISIBLE_DEVICES=0 python robomimic/scripts/train.py --config robomimic/exps/templates/bc1_img.json --dataset ${my hdf5 data path}
```
And using `rgb` as input will result in the following `Warning`:
```bash
WARNING:imageio_ffmpeg:IMAGEIO FFMPEG_WRITER WARNING: input image is not divisible by macro_block_size=16, resizing from (3, 512) to (16, 512) to ensure video compatibility with most codecs and players. To prevent resizing, make your input image divisible by the macro_block_size or set the macro_block_size to 1 (risking incompatibility).
[swscaler @ 0x70c3880] Warning: data is not aligned! This can lead to a speed loss
```
When I set `macro_block_size=1 or macro_block_size=None` in `imageio.get_writer(video_paths[k], fps=20, macro_block_size=xxx)`, there will be an error:
```bash
[swscaler @ 0x715ba00] Warning: data is not aligned! This can lead to a speed loss
[libx264 @ 0x713d640] width not divisible by 2 (3x512)
Error initializing output stream 0:0 -- Error while opening encoder for output stream #0:0 - maybe incorrect parameters such as bit_rate, rate, width or height
```
It is worth mentioning that when I use `low_dim` observation form as input, the generated video is observable and there is no such `WARNING:imageio_ffmpeg`.

Is this a bug? Or is it just me who has this issue?

Thanks!

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Visual (RGB) input leads to unobservable and invalid videos. #207

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Visual (RGB) input leads to unobservable and invalid videos. #207

Description

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions