-
Notifications
You must be signed in to change notification settings - Fork 68
Expand file tree
/
Copy pathtrain.sh
More file actions
89 lines (72 loc) · 2.2 KB
/
train.sh
File metadata and controls
89 lines (72 loc) · 2.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
#!/bin/bash
# train.sh — launch ThinkSound flow-matching training.
#
# Usage: ./train.sh [--debug] [--node-rank N]
#   --debug       run a 1-GPU / 1-node debug job (no log redirection, smaller
#                 checkpoint interval is NOT used — debug checkpoints every 2000)
#   --node-rank   rank of this node in a multi-node run (default 0)
#
# Required files: dataset/model configs and the VAE pretransform checkpoint
# referenced below must exist relative to the working directory.

# Fail fast: abort on errors, unset variables, and mid-pipeline failures,
# instead of silently continuing into an 8-GPU launch with a broken setup.
set -euo pipefail

ckpt_dir="your_project_name"
log_dir="logs/$ckpt_dir"
dataset_config="ThinkSound/configs/multimodal_dataset_demo.json"
model_config="ThinkSound/configs/model_configs/thinksound.json"
pretransform_ckpt_path="ckpts/vae.ckpt"

#export MASTER_ADDR="10.32.3.240"
export MASTER_PORT="9511"
# pip install git+https://github.com/patrick-kidger/torchcubicspline.git

debug_mode="false"
node_rank=0

# Parse command-line flags.
while [[ "$#" -gt 0 ]]; do
    case "$1" in
        --debug) debug_mode="true"; shift ;;
        --node-rank) node_rank="$2"; shift 2 ;;
        *) echo "Unknown parameter passed: $1" >&2; exit 1 ;;  # diagnostics to stderr
    esac
done

export NODE_RANK="$node_rank"
export WORLD_SIZE=8

# -p: idempotent — a plain `mkdir demos` errored out on every re-run once the
# directory existed.
mkdir -p demos

if [ "$debug_mode" != "true" ]; then
    mkdir -p "$log_dir"
    # Snapshot the configs and this launcher alongside the logs so every run
    # is reproducible from its own log directory.
    cp "$dataset_config" "$log_dir/"
    cp "$model_config" "$log_dir/"
    cp "$0" "$log_dir/"
fi

# Debug runs shrink to a single GPU on a single node.
if [ "$debug_mode" == "true" ]; then
    num_gpus=1
    num_nodes=1
else
    num_gpus=8
    num_nodes=1
fi

echo "Training Configuration:"
echo "Checkpoint Directory: $ckpt_dir"
echo "Log Directory: $log_dir"
echo "Dataset Config: $dataset_config"
echo "Model Config: $model_config"
echo "Pretransform Checkpoint Path: $pretransform_ckpt_path"
echo "Num GPUs: $num_gpus"
echo "Num Nodes: $num_nodes"
echo "Batch Size: 32"
echo "Num Workers: 24"
echo "Node Rank: $node_rank"

if [ "$debug_mode" == "true" ]; then
    # Debug: run in the foreground (nohup kept for parity with the normal
    # path; output goes to the terminal / nohup.out), checkpoint more often.
    nohup python train.py \
        --dataset-config "$dataset_config" \
        --model-config "$model_config" \
        --name "$ckpt_dir" \
        --save-dir "logs/" \
        --pretransform-ckpt-path "$pretransform_ckpt_path" \
        --checkpoint-every 2000 \
        --num-gpus "$num_gpus" \
        --num-nodes "$num_nodes" \
        --batch-size 32 \
        --num-workers 24
else
    # Production: detach with nohup + &, capture stdout/stderr to the log dir.
    nohup python train.py \
        --dataset-config "$dataset_config" \
        --model-config "$model_config" \
        --name "$ckpt_dir" \
        --save-dir "logs/" \
        --pretransform-ckpt-path "$pretransform_ckpt_path" \
        --checkpoint-every 4000 \
        --num-gpus "$num_gpus" \
        --num-nodes "$num_nodes" \
        --batch-size 32 \
        --num-workers 24 \
        > "$log_dir/train.log" 2>&1 &
    echo "Training started. Logs can be found in $log_dir/train.log"
fi