-
Notifications
You must be signed in to change notification settings - Fork 68
Expand file tree
/
Copy patheval_batch.py
More file actions
136 lines (107 loc) · 4.86 KB
/
eval_batch.py
File metadata and controls
136 lines (107 loc) · 4.86 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
from prefigure.prefigure import get_all_args, push_wandb_config
import json
import os
import re
import torch
import torchaudio
from lightning.pytorch import seed_everything
import random
from datetime import datetime
import numpy as np
from ThinkSound.data.datamodule import DataModule
from ThinkSound.models import create_model_from_config
from ThinkSound.models.utils import load_ckpt_state_dict, remove_weight_norm_from_model
from ThinkSound.inference.sampling import sample, sample_discrete_euler
from pathlib import Path
from tqdm import tqdm
def predict_step(diffusion, batch, diffusion_objective, device='cuda:0'):
diffusion = diffusion.to(device)
reals, metadata = batch
ids = [item['id'] for item in metadata]
batch_size, length = reals.shape[0], reals.shape[2]
with torch.amp.autocast('cuda'):
conditioning = diffusion.conditioner(metadata, device)
video_exist = torch.stack([item['video_exist'] for item in metadata],dim=0)
conditioning['metaclip_features'][~video_exist] = diffusion.model.model.empty_clip_feat
conditioning['sync_features'][~video_exist] = diffusion.model.model.empty_sync_feat
cond_inputs = diffusion.get_conditioning_inputs(conditioning)
if batch_size > 1:
noise_list = []
for _ in range(batch_size):
noise_1 = torch.randn([1, diffusion.io_channels, length]).to(device) # 每次生成推进RNG状态
noise_list.append(noise_1)
noise = torch.cat(noise_list, dim=0)
else:
noise = torch.randn([batch_size, diffusion.io_channels, length]).to(device)
with torch.amp.autocast('cuda'):
model = diffusion.model
if diffusion_objective == "v":
fakes = sample(model, noise, 24, 0, **cond_inputs, cfg_scale=5, batch_cfg=True)
elif diffusion_objective == "rectified_flow":
fakes = sample_discrete_euler(model, noise, 24, **cond_inputs, cfg_scale=5, batch_cfg=True)
if diffusion.pretransform is not None:
fakes = diffusion.pretransform.decode(fakes)
audios = fakes.to(torch.float32).div(torch.max(torch.abs(fakes))).clamp(-1, 1).mul(32767).to(torch.int16).cpu()
return audios
def main():
args = get_all_args()
if args.save_dir == '':
args.save_dir = args.results_dir
seed = args.seed
if os.environ.get("SLURM_PROCID") is not None:
seed += int(os.environ.get("SLURM_PROCID"))
seed_everything(seed, workers=True)
# Load config
if args.model_config == '':
args.model_config = "ThinkSound/configs/model_configs/thinksound.json"
with open(args.model_config) as f:
model_config = json.load(f)
duration = float(args.duration_sec)
sample_rate = model_config["sample_rate"]
latent_length = round(44100 / 64 / 32 * duration)
model_config["sample_size"] = duration * sample_rate
model_config["model"]["diffusion"]["config"]["sync_seq_len"] = 24 * int(duration)
model_config["model"]["diffusion"]["config"]["clip_seq_len"] = 8 * int(duration)
model_config["model"]["diffusion"]["config"]["latent_seq_len"] = latent_length
model = create_model_from_config(model_config)
if args.compile:
model = torch.compile(model)
model.load_state_dict(torch.load(args.ckpt_dir))
vae_state = load_ckpt_state_dict(args.pretransform_ckpt_path, prefix='autoencoder.')
model.pretransform.load_state_dict(vae_state)
if args.dataset_config == '':
args.dataset_config = "ThinkSound/configs/multimodal_dataset_demo.json"
with open(args.dataset_config) as f:
dataset_config = json.load(f)
for td in dataset_config["test_datasets"]:
td["path"] = args.results_dir
dm = DataModule(
dataset_config,
batch_size=args.batch_size,
test_batch_size=args.test_batch_size,
num_workers=args.num_workers,
sample_rate=model_config["sample_rate"],
sample_size=(float)(args.duration_sec) * model_config["sample_rate"],
audio_channels=model_config.get("audio_channels", 2),
latent_length=round(44100/64/32*duration),
)
dm.setup('predict')
dl = dm.predict_dataloader()
current_date = datetime.now()
formatted_date = current_date.strftime('%m%d')
audio_dir = os.path.join(args.save_dir,f'{formatted_date}_batch_size'+str(args.test_batch_size))
os.makedirs(audio_dir,exist_ok=True)
for batch in tqdm(dl, desc="Predicting"):
audio = predict_step(
model,
batch=batch,
diffusion_objective=model_config["model"]["diffusion"]["diffusion_objective"],
device='cuda:0'
)
_, metadata = batch
ids = [item['id'] for item in metadata]
for i in range(audio.size(0)):
id_str = ids[i] if i < len(ids) else f"unknown_{i}"
torchaudio.save(os.path.join(audio_dir, f"{id_str}.wav"), audio[i], 44100)
if __name__ == '__main__':
main()