From 268e00291979bee178b509a1c5d697733b24a958 Mon Sep 17 00:00:00 2001 From: leo-chen <1483949750@qq.com> Date: Fri, 5 Dec 2025 15:47:38 +0800 Subject: [PATCH 1/4] =?UTF-8?q?=E5=A4=9A=E5=8D=A1=E5=88=86=E5=B8=83?= =?UTF-8?q?=E5=BC=8F=E8=AE=AD=E7=BB=83=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- config/persformer_openlane.py | 4 ++-- requirements-new.txt | 17 +++++++++++++++++ 2 files changed, 19 insertions(+), 2 deletions(-) create mode 100644 requirements-new.txt diff --git a/config/persformer_openlane.py b/config/persformer_openlane.py index 642d21d..43378ab 100644 --- a/config/persformer_openlane.py +++ b/config/persformer_openlane.py @@ -27,8 +27,8 @@ def config(args): # 1000 sequence args.dataset_name = 'openlane' - args.dataset_dir = '/mnt/disk01/openlane/images/' - args.data_dir = '/mnt/disk01/openlane/lane3d_1000/' + args.dataset_dir = '/root/autodl-tmp/dataset/openlane-v1.0/images/' + args.data_dir = '/root/autodl-tmp/dataset/openlane-v1.0/lane3d_1000/' if 'openlane' in args.dataset_name: openlane_config(args) diff --git a/requirements-new.txt b/requirements-new.txt new file mode 100644 index 0000000..fea1142 --- /dev/null +++ b/requirements-new.txt @@ -0,0 +1,17 @@ +opencv-python==4.5.5.64 +geffnet==1.0.2 +Shapely==1.8.5 +scipy==1.10.1 +lxml==4.9.3 +xmljson==0.2.0 +ujson==1.35 +PyYAML==5.4.1 +scikit-learn==1.0.2 +matplotlib==3.5.1 +tqdm==4.64.1 +tensorboard==2.12.2 +tensorboardX==2.5 +gdown==4.4.0 +p_tqdm==1.3.3 +ortools==9.3.10497 +thop==0.1.1.post2209072238 \ No newline at end of file From 3405e7f83c1771ddf405994ebd34403931f889a7 Mon Sep 17 00:00:00 2001 From: leo-chen <1483949750@qq.com> Date: Wed, 17 Dec 2025 17:32:19 +0800 Subject: [PATCH 2/4] chore: minor updates on 2025-12-17 --- bookmarks.json | 91 +++++++++++++++++++++++++++++++++++++++++++ data/Load_Data.py | 44 ++++++++++++++------- experiments/ddp.py | 8 +++- experiments/runner.py | 11 ++++-- utils/utils.py | 11 +++--- 5 files changed, 142 insertions(+), 23 deletions(-) create mode 100644 bookmarks.json diff --git a/bookmarks.json b/bookmarks.json new file mode 100644 index 0000000..05e418d --- /dev/null +++ b/bookmarks.json @@ -0,0 +1,91 @@ +{ + "version": "3.0.1", + "lastModified": "2025-12-17T06:25:22.492Z", + "folders": [], + "bookmarks": [ + { + "id": "mj113126drhj06q881n", + "name": "Re-calculate extrinsic matrix based on ground coordinate", + "file": "data/Load_Data.py", + "line": 557, + "character": 16, + "lineContent": "gt_cam_height = cam_extrinsics[2, 3]", + "sortOrder": 2, + "color": "orange" + }, + { + "id": "mj11isucznhzdm43lq", + "name": "Coordinate convertion for openlane_300 data", + "file": "data/Load_Data.py", + "line": 584, + "character": 16, + "lineContent": "lane = np.matmul(cam_extrinsics, np.matmul(cam_representation, lane))", + "sortOrder": 3, + "color": "orange" + }, + { + "id": "mj11v2knq99epobgbcr", + "name": "convert 3d lanes to flat ground space", + "file": "data/Load_Data.py", + "line": 634, + "character": 8, + "lineContent": "self.convert_lanes_3d_to_gflat(gt_lanes, P_g2gflat)", + "sortOrder": 4, + "color": "orange" + }, + { + "id": "mj6hadqnfgbwyfseb0r", + "name": "class Runner:", + "file": "experiments/runner.py", + "line": 43, + "character": 0, + "lineContent": "class Runner:", + "sortOrder": 0 + }, + { + "id": "mj6harrsdxvm3dmuyje", + "name": "def train(self):", + "file": "experiments/runner.py", + "line": 106, + "character": 4, + "lineContent": "def train(self):", + "sortOrder": 1 + }, + { + "id": 
"mj9fg5e8ip7goi8o6bq", + "name": "if not args.no_tb and args.proc_id == 0:", + "file": "experiments/runner.py", + "line": 147, + "character": 8, + "lineContent": "if not args.no_tb and args.proc_id == 0:", + "sortOrder": 6 + }, + { + "id": "mj9g25gd8p16a05cmsl", + "name": "def WIP__getitem__(self, idx):", + "file": "data/Load_Data.py", + "line": 707, + "character": 4, + "lineContent": "def WIP__getitem__(self, idx):", + "sortOrder": 7 + }, + { + "id": "mj9mgdzc68ii017taws", + "name": "label_list = glob.glob(json_file_path + '**/*.json', recursive=True)", + "file": "data/Load_Data.py", + "line": 1214, + "character": 8, + "lineContent": "label_list = glob.glob(json_file_path + '**/*.json', recursive=True)", + "sortOrder": 8 + }, + { + "id": "mj9mj7jgurjuhpsx1kg", + "name": "train_dataset = LaneDataset(args.dataset_dir, args.data_dir + 'training/', args, data_aug=True, save_std=True, seg_bev=args.seg_bev)", + "file": "experiments/runner.py", + "line": 689, + "character": 12, + "lineContent": "train_dataset = LaneDataset(args.dataset_dir, args.data_dir + 'training/', args, data_aug=True, save_std=True, seg_bev=args.seg_bev)", + "sortOrder": 5 + } + ] +} \ No newline at end of file diff --git a/data/Load_Data.py b/data/Load_Data.py index e45636a..0fdc3b9 100644 --- a/data/Load_Data.py +++ b/data/Load_Data.py @@ -42,14 +42,14 @@ class LaneDataset(Dataset): """ - Dataset with labeled lanes - This implementation considers: - w/o laneline 3D attributes - w/o centerline annotations - default considers 3D laneline, including centerlines - - This new version of data loader prepare ground-truth anchor tensor in flat ground space. - It is assumed the dataset provides accurate visibility labels. Preparing ground-truth tensor depends on it. + 带标注车道线的数据集 + 本实现考虑: + 不含车道线的3D属性 + 不含中心线标注 + 默认考虑3D车道线,包括中心线 + + 这个新版本的数据加载器在平地空间中准备真实锚点张量。 + 假设数据集提供了准确的可见性标签。准备真实张量依赖于这些标签。 """ # dataset_base_dir is image path, json_file_path is json file path, def __init__(self, dataset_base_dir, json_file_path, args, data_aug=False, save_std=False, seg_bev=False): @@ -97,7 +97,7 @@ def __init__(self, dataset_base_dir, json_file_path, args, data_aug=False, save_ self.K = args.K self.H_crop = homography_crop_resize([args.org_h, args.org_w], args.crop_y, [args.resize_h, args.resize_w]) - # transformation from ipm to ground region + # 从 IPM 到地面区域的转换 self.H_ipm2g = cv2.getPerspectiveTransform(np.float32([[0, 0], [self.ipm_w-1, 0], [0, self.ipm_h-1], @@ -263,7 +263,7 @@ def __init__(self, dataset_base_dir, json_file_path, args, data_aug=False, save_ self.save_json_path = args.save_json_path - # parse ground-truth file + # 解析真实数据文件 if 'openlane' in self.dataset_name: self._x_off_std, \ self._y_off_std, \ @@ -529,6 +529,7 @@ def preprocess_data_from_json_openlane(self, idx_json_file): _laneline_ass_id = None with open(idx_json_file, 'r') as file: + # idx_json_file:具体的JSON文件 file_lines = [line for line in file] info_dict = json.loads(file_lines[0]) @@ -539,12 +540,15 @@ def preprocess_data_from_json_openlane(self, idx_json_file): if not self.fix_cam: cam_extrinsics = np.array(info_dict['extrinsic']) # Re-calculate extrinsic matrix based on ground coordinate + # 绕Z轴旋转-90°的矩阵 R_vg = np.array([[0, 1, 0], [-1, 0, 0], [0, 0, 1]], dtype=float) + # 绕X轴旋转-90°的矩阵 R_gc = np.array([[1, 0, 0], [0, 0, 1], [0, -1, 0]], dtype=float) + # 坐标系的相似变换加上一个坐标轴重新定向 cam_extrinsics[:3, :3] = np.matmul(np.matmul( np.matmul(np.linalg.inv(R_vg), cam_extrinsics[:3, :3]), R_vg), R_gc) @@ -791,6 +795,13 @@ def WIP__getitem__(self, idx): image = F.crop(image, 
self.h_crop, 0, self.h_org-self.h_crop, self.w_org) image = F.resize(image, size=(self.h_net, self.w_net), interpolation=InterpolationMode.BILINEAR) + # Debug:Visualize resize image + # import matplotlib.pyplot as plt + # import numpy as np + # plt.imshow(np.array(image)) + # plt.title(f'缩放后: {image.size}') + # plt.show() + gt_anchor = np.zeros([self.anchor_num, self.num_types, self.anchor_dim], dtype=np.float32) gt_anchor[:, :, self.anchor_dim - self.num_category] = 1.0 gt_lanes = _label_laneline @@ -1193,12 +1204,15 @@ def read_cache_file_beta(self, cache_file): def init_dataset_openlane_beta(self, dataset_base_dir, json_file_path): """ :param dataset_info_file: - :return: image paths, labels in unormalized net input coordinates + :return: 图像路径,标签在未归一化的网络输入坐标中 - data processing: - ground truth labels map are scaled wrt network input sizes + 数据处理: + 真值标签映射相对于网络输入尺寸进行缩放 """ + # args.dataset_dir = '/root/autodl-tmp/dataset/openlane-v1.0/images/' + # args.data_dir = '/root/autodl-tmp/dataset/openlane-v1.0/lane3d_1000/training/' + # 返回所有匹配文件完整路径的列表,赋值给 label_list 变量 label_list = glob.glob(json_file_path + '**/*.json', recursive=True) # save label list and this determine the idx order @@ -1277,6 +1291,8 @@ def init_dataset_openlane_beta(self, dataset_base_dir, json_file_path): file_lines = [line for line in file] info_dict = json.loads(file_lines[0]) + # args.dataset_dir = '/root/autodl-tmp/dataset/openlane-v1.0/images/' + # file_path "training/segment-15832924468527961_1564_160_1584_160_with_camera_labels/150767882687643500.jpg" image_path = ops.join(dataset_base_dir, info_dict['file_path']) assert ops.exists(image_path), '{:s} not exist'.format(image_path) @@ -2499,7 +2515,7 @@ def seed_worker(worker_id): def get_loader(transformed_dataset, args): """ create dataset from ground-truth - return a batch sampler based ont the dataset + return a batch sampler based on the dataset """ # transformed_dataset = LaneDataset(dataset_base_dir, json_file_path, args) diff --git a/experiments/ddp.py b/experiments/ddp.py index d403348..148dfe4 100644 --- a/experiments/ddp.py +++ b/experiments/ddp.py @@ -23,8 +23,14 @@ import random def setup_dist_launch(args): + # 通过launch.json 文件中定义的PyTorch分布式启动器动态获取进程ID args.proc_id = args.local_rank - world_size = int(os.getenv('WORLD_SIZE', 1))*args.nodes + # 包含隐形错误,pytorch分布式launch模块已为WORLD_SIZE分配值为总进程数 = args.nnodes * args.nproc_per_node + + + # 通过launch.json 文件中定义的PyTorch分布式启动器获取总进程数 + # world_size = int(os.getenv('WORLD_SIZE', 1))*args.nodes + world_size = int(os.environ['WORLD_SIZE']) print("proc_id: " + str(args.proc_id)) print("world size: " + str(world_size)) print("local_rank: " + str(args.local_rank)) diff --git a/experiments/runner.py b/experiments/runner.py index 8b6a6d7..ee0d51e 100644 --- a/experiments/runner.py +++ b/experiments/runner.py @@ -48,13 +48,13 @@ def __init__(self, args): if args.proc_id == 0: if not args.no_cuda and not torch.cuda.is_available(): raise Exception("No gpu available for usage") - if int(os.getenv('WORLD_SIZE', 1)) >= 1: + if int(os.getenv('WORLD_SIZE', 1)) >= 1: # WORLD_SIZE = 参与训练的总进程数。单机多卡时,进程数等于GPU数量 print("Let's use", os.environ['WORLD_SIZE'], "GPUs!") torch.cuda.empty_cache() - save_id = args.mod + save_id = args.mod # 训练模型名称 args.save_json_path = args.save_path - args.save_path = os.path.join(args.save_path, save_id) + args.save_path = os.path.join(args.save_path, save_id) # save_path 位置:data_splits/openlane/PersFormer/ if args.proc_id == 0: mkdir_if_missing(args.save_path) mkdir_if_missing(os.path.join(args.save_path, 
'example/')) @@ -72,7 +72,10 @@ def __init__(self, args): self.valid_set_labels = [json.loads(line) for line in open(self.val_gt_file).readlines()] # self.crit_string = 'loss_gflat' + # crit_string损失准则:动态选择损失函数实例 + # criterion:损失函数对象 self.crit_string = args.crit_string + # Define loss criteria if self.crit_string == 'loss_gflat_3D': self.criterion = Loss_crit.Laneline_loss_gflat_3D(args.batch_size, self.train_dataset.num_types, @@ -682,6 +685,8 @@ def eval(self): def _get_train_dataset(self): args = self.args if 'openlane' in args.dataset_name: + # args.dataset_dir = '/root/autodl-tmp/dataset/openlane-v1.0/images/' + # args.data_dir = '/root/autodl-tmp/dataset/openlane-v1.0/lane3d_1000/' train_dataset = LaneDataset(args.dataset_dir, args.data_dir + 'training/', args, data_aug=True, save_std=True, seg_bev=args.seg_bev) elif 'once' in args.dataset_name: diff --git a/utils/utils.py b/utils/utils.py index 3152940..cae095d 100755 --- a/utils/utils.py +++ b/utils/utils.py @@ -739,11 +739,11 @@ def prune_3d_lane_by_range(lane_3d, x_min, x_max): def resample_laneline_in_y(input_lane, y_steps, out_vis=False): """ - Interpolate x, z values at each anchor grid, including those beyond the range of input lnae y range - :param input_lane: N x 2 or N x 3 ndarray, one row for a point (x, y, z-optional). - It requires y values of input lane in ascending order - :param y_steps: a vector of steps in y - :param out_vis: whether to output visibility indicator which only depends on input y range + 在每个锚点网格上插值 x、z 值,包括输入车道 y 范围之外的值 + :param input_lane: N x 2 或 N x 3 ndarray,每行表示一个点 (x, y, z-可选)。 + 要求输入车道的 y 值按升序排列 + :param y_steps: y 方向的步长向量 + :param out_vis: 是否输出可见性标志,该标志仅依赖输入 y 范围 :return: """ @@ -756,6 +756,7 @@ def resample_laneline_in_y(input_lane, y_steps, out_vis=False): if input_lane.shape[1] < 3: input_lane = np.concatenate([input_lane, np.zeros([input_lane.shape[0], 1], dtype=np.float32)], axis=1) + # TODO 插值方法是否可以改变?scipy.interpolate.interp1d默认是linear f_x = interp1d(input_lane[:, 1], input_lane[:, 0], fill_value="extrapolate") f_z = interp1d(input_lane[:, 1], input_lane[:, 2], fill_value="extrapolate") From 2836372aebb5c10dda01cbf6a87695e734cdd5c6 Mon Sep 17 00:00:00 2001 From: leo-chen <1483949750@qq.com> Date: Tue, 6 Jan 2026 09:47:34 +0800 Subject: [PATCH 3/4] Enable single-machine training mode --- bookmarks.json | 57 ++++++++++++++++++++++++++++--------------- data/Load_Data.py | 12 ++++----- experiments/runner.py | 2 ++ utils/utils.py | 12 ++++----- 4 files changed, 52 insertions(+), 31 deletions(-) diff --git a/bookmarks.json b/bookmarks.json index 05e418d..3120c06 100644 --- a/bookmarks.json +++ b/bookmarks.json @@ -1,6 +1,6 @@ { - "version": "3.0.1", - "lastModified": "2025-12-17T06:25:22.492Z", + "version": "3.0.2", + "lastModified": "2026-01-06T01:25:23.484Z", "folders": [], "bookmarks": [ { @@ -17,7 +17,7 @@ "id": "mj11isucznhzdm43lq", "name": "Coordinate convertion for openlane_300 data", "file": "data/Load_Data.py", - "line": 584, + "line": 587, "character": 16, "lineContent": "lane = np.matmul(cam_extrinsics, np.matmul(cam_representation, lane))", "sortOrder": 3, @@ -27,7 +27,7 @@ "id": "mj11v2knq99epobgbcr", "name": "convert 3d lanes to flat ground space", "file": "data/Load_Data.py", - "line": 634, + "line": 637, "character": 8, "lineContent": "self.convert_lanes_3d_to_gflat(gt_lanes, P_g2gflat)", "sortOrder": 4, @@ -46,7 +46,7 @@ "id": "mj6harrsdxvm3dmuyje", "name": "def train(self):", "file": "experiments/runner.py", - "line": 106, + "line": 108, "character": 4, "lineContent": "def 
train(self):", "sortOrder": 1 @@ -55,37 +55,56 @@ "id": "mj9fg5e8ip7goi8o6bq", "name": "if not args.no_tb and args.proc_id == 0:", "file": "experiments/runner.py", - "line": 147, + "line": 149, "character": 8, "lineContent": "if not args.no_tb and args.proc_id == 0:", - "sortOrder": 6 + "sortOrder": 7 }, { "id": "mj9g25gd8p16a05cmsl", "name": "def WIP__getitem__(self, idx):", "file": "data/Load_Data.py", - "line": 707, + "line": 529, "character": 4, "lineContent": "def WIP__getitem__(self, idx):", - "sortOrder": 7 - }, - { - "id": "mj9mgdzc68ii017taws", - "name": "label_list = glob.glob(json_file_path + '**/*.json', recursive=True)", - "file": "data/Load_Data.py", - "line": 1214, - "character": 8, - "lineContent": "label_list = glob.glob(json_file_path + '**/*.json', recursive=True)", "sortOrder": 8 }, { "id": "mj9mj7jgurjuhpsx1kg", "name": "train_dataset = LaneDataset(args.dataset_dir, args.data_dir + 'training/', args, data_aug=True, save_std=True, seg_bev=args.seg_bev)", "file": "experiments/runner.py", - "line": 689, + "line": 691, "character": 12, "lineContent": "train_dataset = LaneDataset(args.dataset_dir, args.data_dir + 'training/', args, data_aug=True, save_std=True, seg_bev=args.seg_bev)", - "sortOrder": 5 + "sortOrder": 6 + }, + { + "id": "mjb74zjwojlzastp8na", + "name": "def init_dataset_openlane_beta(self, dataset_base_dir, json_file_path):", + "file": "data/Load_Data.py", + "line": 1203, + "character": 4, + "lineContent": "def init_dataset_openlane_beta(self, dataset_base_dir, json_file_path):", + "sortOrder": 9, + "color": "green" + }, + { + "id": "mjem5vcvvnxgvphy8v", + "name": "x_off_values = x_values - self.anchor_grid_x # offset compared with all anchors instead of closest one", + "file": "data/Load_Data.py", + "line": 2211, + "character": 16, + "lineContent": "x_off_values = x_values - self.anchor_grid_x # offset compared with all anchors instead of closest one", + "sortOrder": 12 + }, + { + "id": "mjem7osmwbkw3lpimhq", + "name": "if not self.new_match:", + "file": "data/Load_Data.py", + "line": 603, + "character": 12, + "lineContent": "if not self.new_match:", + "sortOrder": 13 } ] } \ No newline at end of file diff --git a/data/Load_Data.py b/data/Load_Data.py index 0fdc3b9..cdfe933 100644 --- a/data/Load_Data.py +++ b/data/Load_Data.py @@ -704,7 +704,7 @@ def __len__(self): """ return self.n_samples - # new getitem, WIP + # 新的getitem,进行中 def WIP__getitem__(self, idx): """ Args: idx (int): Index in list to load image @@ -796,11 +796,11 @@ def WIP__getitem__(self, idx): image = F.resize(image, size=(self.h_net, self.w_net), interpolation=InterpolationMode.BILINEAR) # Debug:Visualize resize image - # import matplotlib.pyplot as plt - # import numpy as np - # plt.imshow(np.array(image)) - # plt.title(f'缩放后: {image.size}') - # plt.show() + import matplotlib.pyplot as plt + import numpy as np + plt.imshow(np.array(image)) + plt.title(f'缩放后: {image.size}') + plt.show() gt_anchor = np.zeros([self.anchor_num, self.num_types, self.anchor_dim], dtype=np.float32) gt_anchor[:, :, self.anchor_dim - self.num_category] = 1.0 diff --git a/experiments/runner.py b/experiments/runner.py index ee0d51e..2a516b8 100644 --- a/experiments/runner.py +++ b/experiments/runner.py @@ -53,6 +53,8 @@ def __init__(self, args): torch.cuda.empty_cache() save_id = args.mod # 训练模型名称 + + # /root/autodl-tmp/projects/PersFormer_3DLane/data_splits/openlane args.save_json_path = args.save_path args.save_path = os.path.join(args.save_path, save_id) # save_path 位置:data_splits/openlane/PersFormer/ if 
args.proc_id == 0: diff --git a/utils/utils.py b/utils/utils.py index cae095d..9d91da8 100755 --- a/utils/utils.py +++ b/utils/utils.py @@ -917,13 +917,13 @@ def homographic_transformation(Matrix, x, y): def projective_transformation(Matrix, x, y, z): """ - Helper function to transform coordinates defined by transformation matrix + 用于通过变换矩阵转换坐标的辅助函数 - Args: - Matrix (multi dim - array): 3x4 projection matrix - x (array): original x coordinates - y (array): original y coordinates - z (array): original z coordinates +参数: + Matrix (多维数组):3x4 投影矩阵 + x (数组):原始 x 坐标 + y (数组):原始 y 坐标 + z (数组):原始 z 坐标 """ ones = np.ones((1, len(z))) coordinates = np.vstack((x, y, z, ones)) From 153ee9f9ed2e4d2a90bca5c1f596aebe7437a235 Mon Sep 17 00:00:00 2001 From: leo-chen <1483949750@qq.com> Date: Sun, 18 Jan 2026 16:05:13 +0800 Subject: [PATCH 4/4] Debug for single_gpu --- bookmarks.json | 4 +- config/persformer_openlane.py | 3 +- data/Load_Data.py | 6 +- experiments/ddp.py | 6 +- experiments/runner.py | 161 +++++++++++++++++++++++++--------- experiments/single_gpu.py | 38 ++++++++ main_persformer.py | 7 +- utils/utils.py | 2 +- 8 files changed, 175 insertions(+), 52 deletions(-) create mode 100644 experiments/single_gpu.py diff --git a/bookmarks.json b/bookmarks.json index 3120c06..ea105ca 100644 --- a/bookmarks.json +++ b/bookmarks.json @@ -1,6 +1,6 @@ { "version": "3.0.2", - "lastModified": "2026-01-06T01:25:23.484Z", + "lastModified": "2026-01-14T08:17:18.734Z", "folders": [], "bookmarks": [ { @@ -101,7 +101,7 @@ "id": "mjem7osmwbkw3lpimhq", "name": "if not self.new_match:", "file": "data/Load_Data.py", - "line": 603, + "line": 588, "character": 12, "lineContent": "if not self.new_match:", "sortOrder": 13 diff --git a/config/persformer_openlane.py b/config/persformer_openlane.py index 43378ab..4454263 100644 --- a/config/persformer_openlane.py +++ b/config/persformer_openlane.py @@ -121,7 +121,8 @@ def config(args): args.num_y_steps = len(args.anchor_y_steps) # ddp related - args.dist = True + # args.dist = True + args.dist = False args.sync_bn = True args.cudnn = True diff --git a/data/Load_Data.py b/data/Load_Data.py index cdfe933..1239759 100644 --- a/data/Load_Data.py +++ b/data/Load_Data.py @@ -2570,9 +2570,9 @@ def get_loader(transformed_dataset, args): worker_init_fn=seed_worker, generator=g) - if args.dist: - return data_loader, data_sampler - return data_loader + + return data_loader, data_sampler + def compute_2d_lanes(pred_anchor, h_samples, H_g2im, anchor_x_steps, anchor_y_steps, x_min, x_max, prob_th=0.5): diff --git a/experiments/ddp.py b/experiments/ddp.py index 148dfe4..3490f65 100644 --- a/experiments/ddp.py +++ b/experiments/ddp.py @@ -30,12 +30,12 @@ def setup_dist_launch(args): # 通过launch.json 文件中定义的PyTorch分布式启动器获取总进程数 # world_size = int(os.getenv('WORLD_SIZE', 1))*args.nodes - world_size = int(os.environ['WORLD_SIZE']) + args.world_size = int(os.environ['WORLD_SIZE']) print("proc_id: " + str(args.proc_id)) - print("world size: " + str(world_size)) + print("world size: " + str(args.world_size)) print("local_rank: " + str(args.local_rank)) - os.environ['WORLD_SIZE'] = str(world_size) + os.environ['WORLD_SIZE'] = str(args.world_size) os.environ['RANK'] = str(args.proc_id) os.environ['LOCAL_RANK'] = str(args.local_rank) diff --git a/experiments/runner.py b/experiments/runner.py index 2a516b8..227ceee 100644 --- a/experiments/runner.py +++ b/experiments/runner.py @@ -48,8 +48,9 @@ def __init__(self, args): if args.proc_id == 0: if not args.no_cuda and not torch.cuda.is_available(): raise 
Exception("No gpu available for usage") - if int(os.getenv('WORLD_SIZE', 1)) >= 1: # WORLD_SIZE = 参与训练的总进程数。单机多卡时,进程数等于GPU数量 - print("Let's use", os.environ['WORLD_SIZE'], "GPUs!") + # if int(os.getenv('WORLD_SIZE', 1)) >= 1: # WORLD_SIZE = 参与训练的总进程数。单机多卡时,进程数等于GPU数量 + if args.world_size >= 1: + print("Let's use", args.world_size , "GPUs!") torch.cuda.empty_cache() save_id = args.mod # 训练模型名称 @@ -115,10 +116,17 @@ def train(self): train_sampler = self.train_sampler # Define model or resume - if args.model_name == "PersFormer": - model, optimizer, scheduler, best_epoch, lowest_loss, best_f1_epoch, best_val_f1 = self._get_model_ddp() - elif args.model_name == "GenLaneNet": - model1, model2, optimizer, scheduler, best_epoch, lowest_loss, best_f1_epoch, best_val_f1 = self._get_model_ddp() + # Choose model setup based on distributed flag + if args.dist: # 多卡训练 + if args.model_name == "PersFormer": + model, optimizer, scheduler, best_epoch, lowest_loss, best_f1_epoch, best_val_f1 = self._get_model_multi_gpu() + elif args.model_name == "GenLaneNet": + model1, model2, optimizer, scheduler, best_epoch, lowest_loss, best_f1_epoch, best_val_f1 = self._get_model_multi_gpu() + else: # 单卡训练 + if args.model_name == "PersFormer": + model, optimizer, scheduler, best_epoch, lowest_loss, best_f1_epoch, best_val_f1 = self._get_model_single() + elif args.model_name == "GenLaneNet": + model1, model2, optimizer, scheduler, best_epoch, lowest_loss, best_f1_epoch, best_val_f1 = self._get_model_single() criterion = self.criterion if not args.no_cuda: @@ -223,7 +231,14 @@ def train(self): # 3D loss loss_3d, loss_3d_dict = criterion(output_net, gt, pred_hcam, gt_hcam, pred_pitch, gt_pitch) # Add laneatt loss - loss_att, loss_att_dict = model.module.laneatt_head.loss(laneatt_proposals_list, gt_laneline_img, + # 获取原始模型(无论是否 DDP) + if hasattr(model, 'module'): + # 多卡模式:model 是 DDP 包装的 + actual_model = model.module + else: + # 单卡模式:model 就是原始模型 + actual_model = model + loss_att, loss_att_dict = actual_model.laneatt_head.loss(laneatt_proposals_list, gt_laneline_img, cls_loss_weight=args.cls_loss_weight, reg_vis_loss_weight=args.reg_vis_loss_weight) @@ -743,6 +758,53 @@ def load_my_state_dict(self, model, state_dict): # custom function to load mode cnt += 1 print('#reused param: {}'.format(cnt)) return model + def _get_model_single(self): + args = self.args + device = torch.device("cuda" if not args.no_cuda and torch.cuda.is_available() else "cpu") + + # Define network + if args.model_name == "PersFormer": + model = PersFormer(args) + model = model.to(device) + elif args.model_name == "GenLaneNet": + model1 = erfnet.ERFNet(args.num_class) + model2 = GeoNet3D_ext.Net(args, input_dim=args.num_class - 1) + define_init_weights(model2, args.weight_init) + model1 = model1.to(device) + model2 = model2.to(device) + + # Logging setup + best_epoch = 0 + lowest_loss = np.inf + best_f1_epoch = 0 + best_val_f1 = -1e-5 + + # Resume or pretrained (only for PersFormer in single mode) + if args.resume: + # 注意:单卡 resume 不需要 dist.barrier + model, best_epoch, lowest_loss, best_f1_epoch, best_val_f1, _, _ = self.resume_model(args, model) + elif args.pretrained and args.model_name == "PersFormer": + path = 'models/pretrain/model_pretrain.pth.tar' + if os.path.isfile(path): + checkpoint = torch.load(path, map_location=device) + model.load_state_dict(checkpoint['state_dict']) + print(f"Use pretrained model in {path}") + else: + raise FileNotFoundError(f"No pretrained model found in {path}") + + # Optimizer & Scheduler + if args.model_name == 
"PersFormer": + optimizer = define_optim(args.optimizer, model.parameters(), + args.learning_rate, args.weight_decay) + elif args.model_name == "GenLaneNet": + optimizer = define_optim(args.optimizer, model2.parameters(), + args.learning_rate, args.weight_decay) + scheduler = define_scheduler(optimizer, args) + + if args.model_name == "PersFormer": + return model, optimizer, scheduler, best_epoch, lowest_loss, best_f1_epoch, best_val_f1 + elif args.model_name == "GenLaneNet": + return model1, model2, optimizer, scheduler, best_epoch, lowest_loss, best_f1_epoch, best_val_f1 def _get_model_ddp(self): args = self.args @@ -919,40 +981,57 @@ def compute_loss(self, args, epoch, loss_3d, loss_att, loss_seg, uncertainty_los return loss def reduce_all_loss(self, args, loss_list, loss, loss_3d_dict, loss_att_dict, num): - reduced_loss = loss.data - reduced_loss_all = reduce_tensors(reduced_loss, world_size=args.world_size) - losses = loss_list[0] - losses.update(to_python_float(reduced_loss_all), num) - - reduced_vis_loss = loss_3d_dict['vis_loss'].data - reduced_vis_loss = reduce_tensors(reduced_vis_loss, world_size=args.world_size) - losses_3d_vis = loss_list[1] - losses_3d_vis.update(to_python_float(reduced_vis_loss), num) - - reduced_prob_loss = loss_3d_dict['prob_loss'].data - reduced_prob_loss = reduce_tensors(reduced_prob_loss, world_size=args.world_size) - losses_3d_prob = loss_list[2] - losses_3d_prob.update(to_python_float(reduced_prob_loss), num) - - reduced_reg_loss = loss_3d_dict['reg_loss'].data - reduced_reg_loss = reduce_tensors(reduced_reg_loss, world_size=args.world_size) - losses_3d_reg = loss_list[3] - losses_3d_reg.update(to_python_float(reduced_reg_loss), num) - - reduce_2d_vis_loss = loss_att_dict['vis_loss'].data - reduce_2d_vis_loss = reduce_tensors(reduce_2d_vis_loss, world_size=args.world_size) - losses_2d_vis = loss_list[4] - losses_2d_vis.update(to_python_float(reduce_2d_vis_loss), num) - - reduced_2d_cls_loss = loss_att_dict['cls_loss'].data - reduced_2d_cls_loss = reduce_tensors(reduced_2d_cls_loss, world_size=args.world_size) - losses_2d_cls = loss_list[5] - losses_2d_cls.update(to_python_float(reduced_2d_cls_loss), num) - - reduced_2d_reg_loss = loss_att_dict['reg_loss'].data - reduced_2d_reg_loss = reduce_tensors(reduced_2d_reg_loss, world_size=args.world_size) - losses_2d_reg = loss_list[6] - losses_2d_reg.update(to_python_float(reduced_2d_reg_loss), num) + if args.dist: + # 多卡:进行分布式 reduce + reduced_loss = reduce_tensors(loss.data, world_size=args.world_size) + losses = loss_list[0] + losses.update(to_python_float(reduced_loss), num) + + reduced_vis_loss = reduce_tensors(loss_3d_dict['vis_loss'].data, world_size=args.world_size) + losses_3d_vis = loss_list[1] + losses_3d_vis.update(to_python_float(reduced_vis_loss), num) + + reduced_prob_loss = reduce_tensors(loss_3d_dict['prob_loss'].data, world_size=args.world_size) + losses_3d_prob = loss_list[2] + losses_3d_prob.update(to_python_float(reduced_prob_loss), num) + + reduced_reg_loss = reduce_tensors(loss_3d_dict['reg_loss'].data, world_size=args.world_size) + losses_3d_reg = loss_list[3] + losses_3d_reg.update(to_python_float(reduced_reg_loss), num) + + reduce_2d_vis_loss = reduce_tensors(loss_att_dict['vis_loss'].data, world_size=args.world_size) + losses_2d_vis = loss_list[4] + losses_2d_vis.update(to_python_float(reduce_2d_vis_loss), num) + + reduced_2d_cls_loss = reduce_tensors(loss_att_dict['cls_loss'].data, world_size=args.world_size) + losses_2d_cls = loss_list[5] + 
losses_2d_cls.update(to_python_float(reduced_2d_cls_loss), num) + + reduced_2d_reg_loss = reduce_tensors(loss_att_dict['reg_loss'].data, world_size=args.world_size) + losses_2d_reg = loss_list[6] + losses_2d_reg.update(to_python_float(reduced_2d_reg_loss), num) + else: + # 单卡:直接使用本地 loss,不 reduce + losses = loss_list[0] + losses.update(to_python_float(loss.data), num) + + losses_3d_vis = loss_list[1] + losses_3d_vis.update(to_python_float(loss_3d_dict['vis_loss'].data), num) + + losses_3d_prob = loss_list[2] + losses_3d_prob.update(to_python_float(loss_3d_dict['prob_loss'].data), num) + + losses_3d_reg = loss_list[3] + losses_3d_reg.update(to_python_float(loss_3d_dict['reg_loss'].data), num) + + losses_2d_vis = loss_list[4] + losses_2d_vis.update(to_python_float(loss_att_dict['vis_loss'].data), num) + + losses_2d_cls = loss_list[5] + losses_2d_cls.update(to_python_float(loss_att_dict['cls_loss'].data), num) + + losses_2d_reg = loss_list[6] + losses_2d_reg.update(to_python_float(loss_att_dict['reg_loss'].data), num) return loss_list diff --git a/experiments/single_gpu.py b/experiments/single_gpu.py new file mode 100644 index 0000000..a90cd00 --- /dev/null +++ b/experiments/single_gpu.py @@ -0,0 +1,38 @@ +# utils/utils.py + +import torch +import numpy as np +import random +import os + +def init_single_gpu(args): + """ + Initialize necessary attributes for single-GPU (non-distributed) training. + Should be called when args.distributed == False. + """ + args.distributed = False + args.rank = 0 + args.local_rank = 0 + args.world_size = 1 + args.proc_id = 0 + args.gpu = 0 # 默认使用 GPU 0;也可用 torch.cuda.current_device() 如果已设 CUDA_VISIBLE_DEVICES + args.is_main_process = True + + # Set device + if not args.no_cuda and torch.cuda.is_available(): + torch.cuda.set_device(args.gpu) + args.device = torch.device(f'cuda:{args.gpu}') + else: + args.device = torch.device('cpu') + + # Set random seeds for reproducibility + torch.manual_seed(0) + np.random.seed(0) + random.seed(0) + + # CuDNN settings: enable benchmark for speed in single-GPU + torch.backends.cudnn.benchmark = True + torch.backends.cudnn.deterministic = False + + # Optional: log info + print(f"[Single-GPU] Using device: {args.device}") \ No newline at end of file diff --git a/main_persformer.py b/main_persformer.py index 3d650ae..4952304 100644 --- a/main_persformer.py +++ b/main_persformer.py @@ -18,6 +18,7 @@ from config import persformer_openlane, persformer_once, persformer_apollo from utils.utils import * from experiments.ddp import * +from experiments.single_gpu import * from experiments.runner import * @@ -30,7 +31,11 @@ def main(): # persformer_once.config(args) persformer_openlane.config(args) # initialize distributed data parallel set - ddp_init(args) + if args.distributed: + ddp_init(args) + else: + init_single_gpu(args) + # define runner to begin training or evaluation runner = Runner(args) # args.evaluate = True diff --git a/utils/utils.py b/utils/utils.py index 9d91da8..09b0f40 100755 --- a/utils/utils.py +++ b/utils/utils.py @@ -123,7 +123,7 @@ def define_args(): parser.add_argument('--print_freq', type=int, default=500, help='padding') parser.add_argument('--save_freq', type=int, default=500, help='padding') # DDP setting - parser.add_argument('--distributed', action='store_true') + parser.add_argument('--distributed', action='store_true', help="Use DDP for training") parser.add_argument("--local_rank", type=int) parser.add_argument('--gpu', type=int, default = 0) parser.add_argument('--world_size', type=int, default = 1)
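
Note on how the two launch modes introduced in this series fit together (a minimal sketch, not part of any patch). The attribute names (args.proc_id, args.world_size, args.local_rank, args.gpu, args.distributed), the entry point main_persformer.py, and the modules experiments/ddp.py and experiments/single_gpu.py come from the diffs above; the helper function and its name below are illustrative assumptions. In DDP mode the job is launched roughly like `python -m torch.distributed.launch --nproc_per_node=N main_persformer.py --distributed ...`, and the launcher already exports WORLD_SIZE as nnodes * nproc_per_node, which is why PATCH 2/4 reads it directly instead of multiplying by args.nodes; in single-GPU mode (`python main_persformer.py ...`) PATCH 4/4 fills the same attributes with defaults so Runner can treat both paths uniformly.

    # Illustrative sketch only -- mirrors the branching added in this series,
    # not a verbatim copy of experiments/ddp.py or experiments/single_gpu.py.
    import os

    import torch


    def resolve_runtime(args):
        """Fill in proc_id / world_size the way the patched code does."""
        if args.distributed:
            # The PyTorch distributed launcher passes --local_rank to the script
            # and exports WORLD_SIZE = nnodes * nproc_per_node, so WORLD_SIZE is
            # read as-is (the hidden bug fixed in experiments/ddp.py was the
            # extra multiplication by args.nodes).
            args.proc_id = args.local_rank
            args.world_size = int(os.environ['WORLD_SIZE'])
            torch.cuda.set_device(args.local_rank)
        else:
            # Single-GPU defaults, matching experiments/single_gpu.py.
            args.proc_id = 0
            args.local_rank = 0
            args.world_size = 1
            if torch.cuda.is_available():
                torch.cuda.set_device(args.gpu)
        return args

Under this sketch, Runner.__init__ can keep its single check `if args.world_size >= 1:` and reduce_all_loss can branch on args.dist, exactly as the runner.py hunks above do.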