Merged

Changes from all commits · 90 commits
a6ee4a4
Goal behavior fixes (#124)
daphne-cornelisse Nov 11, 2025
c75b549
Update drive.h
eugenevinitsky Nov 13, 2025
d130cad
Add mode to only control the self-driving car (SDC) (#130)
daphne-cornelisse Nov 13, 2025
c26b245
Merge pull request #129 from Emerge-Lab/eugenevinitsky-patch-1
eugenevinitsky Nov 14, 2025
fecbb2d
Fix incorrect obs dim in draw_agent_obs (#109)
eugenevinitsky Nov 15, 2025
0439aa5
Replace product distribution action space with joint distribution (#104)
eugenevinitsky Nov 16, 2025
97dcb3d
Replace default ent_coef and learning_rate hparams (#134)
daphne-cornelisse Nov 16, 2025
040d39d
Add new weights binary with joint action space. (#136)
daphne-cornelisse Nov 18, 2025
058adff
Add support for logging optional evals during training (#133)
daphne-cornelisse Nov 18, 2025
7317d4f
Test for ini parsing (python and C) (#116)
aje-valeo Nov 20, 2025
c697f17
Fix missing arg (#141)
daphne-cornelisse Nov 20, 2025
2d30fa3
Add WOSAC interaction + map metrics. Switch from np -> torch. (#138)
WaelDLZ Nov 21, 2025
f8021df
Multi map render support to wandb (#143)
mpragnay Nov 24, 2025
87033d0
Add mode for controlled experiments (#144)
daphne-cornelisse Nov 24, 2025
aef6501
Little optimizations to use less memory in interaction_features.py (#…
daphne-cornelisse Nov 24, 2025
8690940
Fix broken link
daphne-cornelisse Nov 24, 2025
9d6a311
Data processing script that works decent. (#150)
daphne-cornelisse Nov 25, 2025
99060ba
Pass `map_dir` to the env via `.ini` and enable evaluation on a diffe…
daphne-cornelisse Nov 25, 2025
6eaea31
Add sprites in headless rendering (#152)
daphne-cornelisse Nov 26, 2025
a6af21c
Faster file processing (#153)
nadarenator Nov 27, 2025
6ce4879
Add link to small clean eval dataset
daphne-cornelisse Nov 27, 2025
0eab9bd
Fix link typo
daphne-cornelisse Nov 27, 2025
225ef99
Gif for readme (#155)
daphne-cornelisse Nov 27, 2025
f44573e
Fix link?
daphne-cornelisse Nov 27, 2025
b11d5e1
Fix vertical spaces.
daphne-cornelisse Nov 27, 2025
9c8b017
Update README.md
daphne-cornelisse Nov 27, 2025
eed2e39
Several small improvements for release (#159)
daphne-cornelisse Nov 30, 2025
95ceedd
WIP changes (#156)
nadarenator Dec 1, 2025
9d249b9
Release note
daphne-cornelisse Dec 4, 2025
ff4f00c
Remove magic numbers in `drivenet.h`, set `MAX_AGENTS=32` by default …
daphne-cornelisse Dec 5, 2025
54affb0
Stable: Ensure all tests are passing (#168)
daphne-cornelisse Dec 6, 2025
a03e70a
Add option to zoom in on the map or show full map (#163)
Aditya-Gupta26 Dec 6, 2025
0cf5e3e
Add documentation (#170)
julianh65 Dec 8, 2025
1ce87a2
Add GitHub Actions workflow for docs deployment (#172)
julianh65 Dec 8, 2025
d03a69d
styling fixes (#173)
julianh65 Dec 9, 2025
217cd3b
Add clang format (#132)
aje-valeo Dec 12, 2025
9a4b09a
Add Sanity Command + Maps (#175)
julianh65 Dec 12, 2025
a8bce58
Documentation edits (#176)
daphne-cornelisse Dec 13, 2025
19b5eb6
Early environment resets based on agents' respawn status. (#167)
riccardosavorgnan Dec 13, 2025
43933b2
Speed up end-to-end training: 220K -> 320K on RTX 4080 by reducing # …
daphne-cornelisse Dec 14, 2025
0eeec10
Add pt. (#179)
daphne-cornelisse Dec 14, 2025
6fa0cbb
Docs edits (#178)
daphne-cornelisse Dec 15, 2025
712ac4f
There is a typo in torch.py
WaelDLZ Dec 15, 2025
76dcce1
Merge pull request #181 from Emerge-Lab/wbd/fix_a_typo_in_torchpy
eugenevinitsky Dec 15, 2025
57be883
Use num_maps for eval (#164)
WaelDLZ Dec 16, 2025
fae2b59
Fix small bug in `drive.c` and add binary weights cpt (#184)
daphne-cornelisse Dec 16, 2025
f349b24
Carla junction filter (#187)
mpragnay Dec 17, 2025
4900039
Working Carla Maps (#189)
mpragnay Dec 18, 2025
656d55a
collision fix (#192)
l1onh3art88 Dec 18, 2025
a033669
Fix Ego Speed Calculation (#166)
julianh65 Dec 19, 2025
273d8f2
Small bug fix that makes road edge not appear in agent view for jerk …
daphne-cornelisse Dec 19, 2025
685b3fd
add womd video (#195)
julianh65 Dec 20, 2025
6f0ca30
Add stop/remove collision behavior back (#169)
Aditya-Gupta26 Dec 20, 2025
61911a3
updated docs with multinode training cmd (#174)
mpragnay Dec 20, 2025
45588ca
Carla2d towns (#201)
mpragnay Dec 21, 2025
657281c
initial commit (#204)
julianh65 Dec 22, 2025
c0c5894
Fix goal resampling in Carla maps and make metrics suitable for resam…
daphne-cornelisse Dec 23, 2025
9a58142
Minor correction in resampling code (#183)
WaelDLZ Dec 23, 2025
b6ed82f
Allow human to drive with agents through classic and jerk dynamics mo…
daphne-cornelisse Dec 24, 2025
d4dabdb
Added WOSAC results on the 10k validation dataset (#185)
WaelDLZ Dec 25, 2025
459e875
Drive with agents in browser (#215)
daphne-cornelisse Dec 26, 2025
952069d
Fix demo (#217)
daphne-cornelisse Dec 26, 2025
7ca82c2
Do not randomly switch to another agent in FPV. (#219)
daphne-cornelisse Dec 27, 2025
38139bb
switch docs to mdbooks doc format (#218)
eugenevinitsky Dec 27, 2025
b89e157
Markdown edits and fix demo. (#221)
daphne-cornelisse Dec 28, 2025
bd20606
small fixes in the docs (#220)
eugenevinitsky Dec 28, 2025
07a7a1d
Release 2.0 (#214)
daphne-cornelisse Dec 30, 2025
cb12de1
Fix space and game files.
Dec 30, 2025
1d864db
Fix sup tags.
Dec 30, 2025
978d8a1
rebasing ada with pufferDrive 2
Jan 12, 2026
ce19279
Self play working
Jan 13, 2026
eb34531
Population play and self play rebased
Jan 14, 2026
ab2c29c
All features working
Jan 15, 2026
4d6136d
fixing co player features
Jan 15, 2026
64da48d
trying to pass tests
Jan 15, 2026
70b3148
fixing tests #2
Jan 15, 2026
299f4ff
fixing tests #3
Jan 15, 2026
ace6fee
attempting to fix tests #4
Jan 16, 2026
8f0eab0
attempting to fix tests #4
Jan 16, 2026
951062b
attempting to fix tests #4
Jan 16, 2026
ba472aa
fixing batch size > 1 bug
Jan 16, 2026
3082161
add back binary
m2kulkarni Jan 31, 2026
027bda6
Merge branch 'sync-upstream' of github.com:Emerge-Lab/Adaptive_Drivin…
m2kulkarni Jan 31, 2026
f289654
changed map dir
m2kulkarni Jan 31, 2026
4254f3f
moved maps
m2kulkarni Jan 31, 2026
1087d38
fix test API and config
m2kulkarni Jan 31, 2026
30a0636
Merge branch 'main' into sync-upstream
m2kulkarni Jan 31, 2026
0e829cf
Merge remote-tracking branch 'origin/main' into sync-upstream
m2kulkarni Jan 31, 2026
c657e25
fixing tests
m2kulkarni Feb 1, 2026
654970e
fix tests
m2kulkarni Feb 1, 2026
11 changes: 9 additions & 2 deletions .gitignore
@@ -161,13 +161,16 @@ pufferlib/ocean/impulse_wars/debug-*/
pufferlib/ocean/impulse_wars/release-*/
pufferlib/ocean/impulse_wars/benchmark/


# Ignore data files
data/
pufferlib/resources/drive/binaries/
pufferlib/resources/drive/binaries/*
pufferlib/resources/drive/binaries/training/
pufferlib/resources/drive/binaries/validation/

# But keep map_000.bin for the training test
!pufferlib/resources/drive/binaries/map_000.bin
!pufferlib/resources/drive/binaries/training/map_000.bin
pufferlib/resources/drive/sanity/sanity_binaries/

# Compiled drive binary in root
/drive
@@ -183,6 +186,10 @@ pufferlib/resources/drive/output_agent.gif
pufferlib/resources/drive/output.gif
# Local artifacts and outputs
artifacts/
# Local drive renders
pufferlib/resources/drive/output*.gif
emsdk/
docs/book/*
!docs/book/assets/
pufferlib/resources/drive/output*.mp4

# Local TODO tracking
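Gitignore negations are order- and directory-sensitive (git cannot re-include a file whose parent directory is itself excluded), so the `!...map_000.bin` re-include rules above are worth verifying locally. A minimal sketch, assuming it runs from the repository root with `git` on PATH:

```python
import subprocess

# Ask git which ignore rule (if any) matches each path.
# `git check-ignore -v` prints "source:line:pattern<TAB>path" for ignored
# paths and exits non-zero for paths that are not ignored.
paths = [
    "pufferlib/resources/drive/binaries/map_000.bin",
    "pufferlib/resources/drive/binaries/training/map_000.bin",
    "pufferlib/resources/drive/binaries/training/map_001.bin",
]
for path in paths:
    result = subprocess.run(
        ["git", "check-ignore", "-v", path],
        capture_output=True, text=True,
    )
    verdict = result.stdout.strip() if result.returncode == 0 else "not ignored"
    print(f"{path}: {verdict}")
```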
1 change: 1 addition & 0 deletions external/pyxodr
Submodule pyxodr added at cd4b83
135 changes: 68 additions & 67 deletions pufferlib/config/ocean/adaptive.ini
@@ -25,36 +25,38 @@ num_ego_agents = 512
action_type = discrete
; Options: classic, jerk
dynamics_model = classic
; Number of consecutive scenarios per episode (adaptive-specific)
k_scenarios = 2
reward_vehicle_collision = -0.5
reward_offroad_collision = -0.2
reward_ade = 0.0
reward_offroad_collision = -0.5
dt = 0.1
reward_goal = 1.0
reward_goal_post_respawn = 0.25
reward_goal_post_respawn = 0.25 # in case of reward conditioning, we scale the goal_weight by this number for post respawn
; Meters around goal to be considered "reached"
goal_radius = 2.0
; What to do when goal is reached. Options: 0:"respawn", 1:"generate_new_goals", 2:"stop"
; Max target speed in m/s for the agent to maintain towards the goal
goal_speed = 100.0
; What to do when the goal is reached. Options: 0:"respawn", 1:"generate_new_goals", 2:"stop"
goal_behavior = 0
; Determines the target distance to the new goal in the case of goal_behavior = generate_new_goals.
; Large numbers will select a goal point further away from the agent's current position.
goal_target_distance = 30.0
; Options: 0 - Ignore, 1 - Stop, 2 - Remove
collision_behavior = 0
; Options: 0 - Ignore, 1 - Stop, 2 - Remove
offroad_behavior = 0
; Number of steps before reset
scenario_length = 91
; Resample frequency = k_scenarios * scenario_length (adaptive-specific)
resample_frequency = 182
num_maps = 1000
; Which step of the trajectory to initialize the agents at upon reset
k_scenarios = 2
termination_mode = 1 # 0 - terminate at episode_length, 1 - terminate after all agents have been reset
map_dir = "resources/drive/binaries/training"
num_maps = 10000
; Determines which step of the trajectory to initialize the agents at upon reset
init_steps = 0
; Options: "control_vehicles", "control_agents", "control_tracks_to_predict"
; Options: "control_vehicles", "control_agents", "control_wosac", "control_sdc_only"
control_mode = "control_vehicles"
; Options: "created_all_valid", "create_only_controlled"
init_mode = "create_all_valid"
; train with co players
co_player_enabled = 1

co_player_enabled = True
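The config comments encode an invariant: `resample_frequency = k_scenarios * scenario_length`, i.e. 2 * 91 = 182 with the values in this diff. A minimal sketch of checking it, assuming the settings above live under an `[env]` section and the file is read with Python's `configparser` (the file mixes `;` and `#` comments, including inline ones, so both prefixes must be allowed explicitly):

```python
import configparser

# Parse adaptive.ini with both comment styles enabled.
cfg = configparser.ConfigParser(
    comment_prefixes=(";", "#"), inline_comment_prefixes=(";", "#")
)
cfg.read("pufferlib/config/ocean/adaptive.ini")

env = cfg["env"]  # assumed section name; not shown in this hunk
k_scenarios = env.getint("k_scenarios")          # 2
scenario_length = env.getint("scenario_length")  # 91
resample_frequency = env.getint("resample_frequency")

# Mirrors the comment "Resample frequency = k_scenarios * scenario_length".
assert resample_frequency == k_scenarios * scenario_length  # 2 * 91 == 182
```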

[env.conditioning]
; Options: "none", "reward", "entropy", "discount", "all"
@@ -71,10 +73,9 @@ discount_weight_lb = 0.80
discount_weight_ub = 0.98

[env.co_player_policy]
enabled = True
policy_name = Drive
rnn_name = Recurrent
policy_path = "experiments/puffer_drive_ewdjljwd.pt"
policy_path = "pufferlib/resources/drive/policies/varied_discount.pt"
input_size = 64
hidden_size = 256

@@ -87,39 +88,35 @@ hidden_size = 256
type = "all"
collision_weight_lb = -1.0
collision_weight_ub = 0.0
offroad_weight_lb = -0.4
offroad_weight_ub = 0.0
offroad_weight_lb = 0.0
offroad_weight_ub = -0.2
goal_weight_lb = 0.0
goal_weight_ub = 1.0
entropy_weight_lb = 0.0
entropy_weight_ub = 0.001
discount_weight_lb = 0.80
discount_weight_ub = 0.98

discount_weight_lb = 0.98
discount_weight_ub = 0.80

[train]
seed=42
total_timesteps = 2_000_000_000
# learning_rate = 0.02
# gamma = 0.985
anneal_lr = True
; Needs to be: num_agents * num_workers * BPTT horizon
batch_size = auto
; minibatch_size = 745472
; minibatch_multiplier = 512
; max_minibatch_size = 745472
minibatch_size = 372736
minibatch_multiplier = 256
max_minibatch_size = 372736
; BPTT horizon (overridden by pufferl.py for adaptive agents to k_scenarios * scenario_length)
bptt_horizon = 32
minibatch_size = 36400
max_minibatch_size = 36400
minibatch_multiplier = 400
bptt_horizon = 91
adam_beta1 = 0.9
adam_beta2 = 0.999
adam_eps = 1e-8
clip_coef = 0.2
ent_coef = 0.001
ent_coef = 0.005
gae_lambda = 0.95
gamma = 0.98
learning_rate = 0.001
learning_rate = 0.003
max_grad_norm = 1
prio_alpha = 0.8499999999999999
prio_beta0 = 0.8499999999999999
@@ -128,36 +125,40 @@ vf_clip_coef = 0.1999999999999999
vf_coef = 2
vtrace_c_clip = 1
vtrace_rho_clip = 1
checkpoint_interval = 1000
checkpoint_interval = 100
# Rendering options
render = True
render_interval = 1000
render_interval = 100
; If True, show exactly what the agent sees in agent observation
obs_only = True
; Show grid lines
show_grid = False
show_grid = True
; Draws lines from the ego agent to observed ORUs and road elements to show detection range
show_lasers = False
; Display human xy logs in the background
show_human_logs = True
; Options: str to path (e.g., "resources/drive/binaries/map_001.bin"), None
show_human_logs = False
; If True, zoom in on a part of the map. Otherwise, show full map
zoom_in = True
; Options: List[str to path], str to path (e.g., "resources/drive/training/binaries/map_001.bin"), None
render_map = none
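The new minibatch settings are internally consistent: `minibatch_size = minibatch_multiplier * bptt_horizon`, so each minibatch holds 400 whole 91-step scenario segments. A small arithmetic check (the `batch_size = auto` example uses illustrative agent/worker counts, not values from this block):

```python
# Sanity arithmetic for the [train] block above (values from this diff).
bptt_horizon = 91           # one full scenario per BPTT segment
minibatch_multiplier = 400  # segments per minibatch
minibatch_size = 36400

# Each minibatch is 400 whole 91-step scenario rollouts.
assert minibatch_size == minibatch_multiplier * bptt_horizon  # 400 * 91

# batch_size = auto is documented as num_agents * num_workers * bptt_horizon;
# e.g. 512 ego agents on a single worker would give 512 * 1 * 91 = 46_592.
num_ego_agents, num_workers = 512, 1  # example values, not from [train]
print(num_ego_agents * num_workers * bptt_horizon)
```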

[eval]
eval_interval = 1000
; Path to dataset used for evaluation
map_dir = "resources/drive/binaries/training"
; Evaluation will run on the first num_maps maps in the map_dir directory
num_maps = 20
backend = PufferEnv
# WOSAC (Waymo Open Sim Agents Challenge) evaluation settings
; WOSAC (Waymo Open Sim Agents Challenge) evaluation settings
; If True, enables evaluation on realism metrics each time we save a checkpoint
wosac_realism_eval = True
wosac_realism_eval = False
; Number of policy rollouts per scene
wosac_num_rollouts = 32
; When to start the simulation
wosac_init_steps = 10
; Total number of WOSAC agents to evaluate
wosac_num_agents = 256
; Control the tracks to predict
wosac_control_mode = "control_tracks_to_predict"
; Initialize from the tracks to predict
; Control everything valid at init in the scene
wosac_control_mode = "control_wosac"
; Create everything valid at init in the scene
wosac_init_mode = "create_all_valid"
; Stop when reaching the goal
wosac_goal_behavior = 2
@@ -167,24 +168,22 @@ wosac_sanity_check = False
; Only return aggregate results across all scenes
wosac_aggregate_results = True
; If True, enable human replay evaluation (pair policy-controlled agent with human replays)
human_replay_eval = True
human_replay_eval = False
; Control only the self-driving car
human_replay_control_mode = "control_sdc_only"
; This equals the number of scenarios, since we control one agent in each
human_replay_num_agents = 64
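A rough sense of the eval volume these settings imply, assuming each WOSAC rollout spans the remainder of a 91-step scenario after the init steps (a back-of-the-envelope sketch, not code from the repo):

```python
# Eval-volume arithmetic for the [eval] block above (values from this diff).
num_maps = 20              # eval runs on the first num_maps maps in map_dir
wosac_num_rollouts = 32    # policy rollouts per scene
scenario_length = 91       # from [env]
wosac_init_steps = 10      # simulation starts at step 10

rollouts = num_maps * wosac_num_rollouts
steps_per_rollout = scenario_length - wosac_init_steps
print(rollouts, steps_per_rollout, rollouts * steps_per_rollout)
# 640 rollouts of 81 simulated steps each ~= 51,840 env steps per WOSAC eval.
```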

[sweep.env.reward_vehicle_collision]
distribution = uniform
min = -0.5
max = 0.0
mean = -0.05
[sweep.train.learning_rate]
distribution = log_normal
min = 0.001
mean = 0.003
max = 0.005
scale = auto

[sweep.env.reward_offroad_collision]
distribution = uniform
min = -0.5
max = 0.0
mean = -0.05
[sweep.train.ent_coef]
distribution = log_normal
min = 0.001
mean = 0.005
max = 0.03
scale = auto
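The `[sweep.*]` blocks specify `log_normal` distributions by min/mean/max. A hypothetical sampler sketch, assuming `mean` is the median of the log-normal, `scale` is the sigma of the underlying normal, and min/max clip the draw (the actual sweep backend may parameterize this differently):

```python
import math
import random

def sample_log_normal(lo, mean, hi, scale=0.5):
    """Hypothetical sampler for a [sweep.*] log_normal spec."""
    draw = math.exp(random.gauss(math.log(mean), scale))
    return min(max(draw, lo), hi)

# [sweep.train.learning_rate]: min = 0.001, mean = 0.003, max = 0.005
print(sample_log_normal(0.001, 0.003, 0.005))
# [sweep.train.ent_coef]: min = 0.001, mean = 0.005, max = 0.03
print(sample_log_normal(0.001, 0.005, 0.03))
```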

[sweep.env.goal_radius]
@@ -194,16 +193,18 @@ max = 20.0
mean = 10.0
scale = auto

[sweep.env.reward_ade]
distribution = uniform
min = -0.1
max = 0.0
mean = -0.02
[sweep.train.gae_lambda]
distribution = log_normal
min = 0.95
mean = 0.98
max = 0.999
scale = auto

[sweep.env.reward_goal_post_respawn]
distribution = uniform
min = 0.0
max = 1.0
mean = 0.5
scale = auto
[controlled_exp.train.goal_speed]
values = [10, 20, 30, 3]

[controlled_exp.train.ent_coef]
values = [0.001, 0.005, 0.01]

[controlled_exp.train.seed]
values = [42, 55, 1]
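Unlike the sweep sections, the `[controlled_exp.*]` blocks enumerate explicit values. If the launcher takes the cross-product of the lists above (an assumption; it may pair them differently), that yields 4 * 3 * 3 = 36 runs:

```python
import itertools

# Cross-product of the controlled-experiment value lists above.
goal_speeds = [10, 20, 30, 3]
ent_coefs = [0.001, 0.005, 0.01]
seeds = [42, 55, 1]

runs = list(itertools.product(goal_speeds, ent_coefs, seeds))
print(len(runs))  # 36
for goal_speed, ent_coef, seed in runs[:3]:
    print(f"goal_speed={goal_speed} ent_coef={ent_coef} seed={seed}")
```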