[BUG] Interrupting training fails to do a graceful shutdown #7

@sdatkinson

Description

nam v0.9.0, running the standard Colab trainer. Interrupting the training cell raises a KeyboardInterrupt; PyTorch Lightning announces that it is attempting a graceful shutdown, but the shutdown handler itself crashes with a NameError, so no graceful shutdown actually happens.
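
For context, here is a minimal sketch of the Colab cell that was interrupted. The `run` import and keyword names match the traceback below; the parameter values are placeholders, not the exact ones used:

```python
# Sketch of the NAM Colab training cell that was interrupted
# (placeholder values; the real cell sets these via the notebook UI).
from nam.train.colab import run

run(
    epochs=100,               # placeholder
    architecture="standard",  # placeholder
)
# Pressing "Interrupt execution" while this cell is training raises
# KeyboardInterrupt inside trainer.fit(); Lightning catches it and
# attempts the graceful shutdown that then fails as shown below.
```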

Stack trace:

Detected KeyboardInterrupt, attempting graceful shutdown ...
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/call.py in _call_and_handle_interrupt(trainer, trainer_fn, *args, **kwargs)
     46             return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
---> 47         return trainer_fn(*args, **kwargs)
     48 

25 frames
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/trainer.py in _fit_impl(self, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path)
    573         )
--> 574         self._run(model, ckpt_path=ckpt_path)
    575 

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/trainer.py in _run(self, model, ckpt_path)
    980         # ----------------------------
--> 981         results = self._run_stage()
    982 

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/trainer.py in _run_stage(self)
   1024             with torch.autograd.set_detect_anomaly(self._detect_anomaly):
-> 1025                 self.fit_loop.run()
   1026             return None

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/loops/fit_loop.py in run(self)
    204                 self.on_advance_start()
--> 205                 self.advance()
    206                 self.on_advance_end()

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/loops/fit_loop.py in advance(self)
    362             assert self._data_fetcher is not None
--> 363             self.epoch_loop.run(self._data_fetcher)
    364 

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/loops/training_epoch_loop.py in run(self, data_fetcher)
    139             try:
--> 140                 self.advance(data_fetcher)
    141                 self.on_advance_end(data_fetcher)

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/loops/training_epoch_loop.py in advance(self, data_fetcher)
    249                     # in automatic optimization, there can only be one optimizer
--> 250                     batch_output = self.automatic_optimization.run(trainer.optimizers[0], batch_idx, kwargs)
    251                 else:

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/loops/optimization/automatic.py in run(self, optimizer, batch_idx, kwargs)
    189         else:
--> 190             self._optimizer_step(batch_idx, closure)
    191 

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/loops/optimization/automatic.py in _optimizer_step(self, batch_idx, train_step_and_backward_closure)
    267         # model hook
--> 268         call._call_lightning_module_hook(
    269             trainer,

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/call.py in _call_lightning_module_hook(trainer, hook_name, pl_module, *args, **kwargs)
    166     with trainer.profiler.profile(f"[LightningModule]{pl_module.__class__.__name__}.{hook_name}"):
--> 167         output = fn(*args, **kwargs)
    168 

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/core/module.py in optimizer_step(self, epoch, batch_idx, optimizer, optimizer_closure)
   1305         """
-> 1306         optimizer.step(closure=optimizer_closure)
   1307 

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/core/optimizer.py in step(self, closure, **kwargs)
    152         assert self._strategy is not None
--> 153         step_output = self._strategy.optimizer_step(self._optimizer, closure, **kwargs)
    154 

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/strategies/strategy.py in optimizer_step(self, optimizer, closure, model, **kwargs)
    237         assert isinstance(model, pl.LightningModule)
--> 238         return self.precision_plugin.optimizer_step(optimizer, model=model, closure=closure, **kwargs)
    239 

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/plugins/precision/precision.py in optimizer_step(self, optimizer, model, closure, **kwargs)
    121         closure = partial(self._wrap_closure, model, optimizer, closure)
--> 122         return optimizer.step(closure=closure, **kwargs)
    123 

/usr/local/lib/python3.10/dist-packages/torch/optim/lr_scheduler.py in wrapper(*args, **kwargs)
    129                     opt._opt_called = True  # type: ignore[union-attr]
--> 130                     return func.__get__(opt, opt.__class__)(*args, **kwargs)
    131 

/usr/local/lib/python3.10/dist-packages/torch/optim/optimizer.py in wrapper(*args, **kwargs)
    483 
--> 484                 out = func(*args, **kwargs)
    485                 self._optimizer_step_code()

/usr/local/lib/python3.10/dist-packages/torch/optim/optimizer.py in _use_grad(self, *args, **kwargs)
     88             torch._dynamo.graph_break()
---> 89             ret = func(self, *args, **kwargs)
     90         finally:

/usr/local/lib/python3.10/dist-packages/torch/optim/adam.py in step(self, closure)
    225 
--> 226             adam(
    227                 params_with_grad,

/usr/local/lib/python3.10/dist-packages/torch/optim/optimizer.py in maybe_fallback(*args, **kwargs)
    160             else:
--> 161                 return func(*args, **kwargs)
    162 

/usr/local/lib/python3.10/dist-packages/torch/optim/adam.py in adam(params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs, state_steps, foreach, capturable, differentiable, fused, grad_scale, found_inf, has_complex, amsgrad, beta1, beta2, lr, weight_decay, eps, maximize)
    765 
--> 766     func(
    767         params,

/usr/local/lib/python3.10/dist-packages/torch/optim/adam.py in _multi_tensor_adam(params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs, state_steps, grad_scale, found_inf, amsgrad, has_complex, beta1, beta2, lr, weight_decay, eps, maximize, capturable, differentiable)
    517         if device_state_steps[0].is_cpu:
--> 518             torch._foreach_add_(
    519                 device_state_steps, torch.tensor(1.0, device="cpu"), alpha=1.0

KeyboardInterrupt: 

During handling of the above exception, another exception occurred:

NameError                                 Traceback (most recent call last)
<ipython-input-1-2c9b0f02eea8> in <cell line: 69>()
     67 
     68 get_ipython().run_line_magic('tensorboard', '--logdir /content/lightning_logs')
---> 69 run(
     70     epochs=epochs,
     71     architecture=architecture,

/usr/local/lib/python3.10/dist-packages/nam/train/colab.py in run(epochs, delay, model_type, architecture, lr, lr_decay, seed, user_metadata, ignore_checks, fit_cab)
     98     input_version, input_basename = _check_for_files()
     99 

--> 100     train_output: TrainOutput = train(
    101         input_basename,
    102         _OUTPUT_BASENAME,

/usr/local/lib/python3.10/dist-packages/nam/train/core.py in train(input_path, output_path, train_path, input_version, epochs, delay, latency, model_type, architecture, batch_size, ny, lr, lr_decay, seed, save_plot, silent, modelname, ignore_checks, local, fit_cab, threshold_esr, user_metadata, fast_dev_run)
   1426     # Suppress the PossibleUserWarning about num_workers (Issue 345)
   1427     with filter_warnings("ignore", category=PossibleUserWarning):
-> 1428         trainer.fit(model, train_dataloader, val_dataloader)
   1429 
   1430     # Go to best checkpoint

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/trainer.py in fit(self, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path)
    536         self.state.status = TrainerStatus.RUNNING
    537         self.training = True
--> 538         call._call_and_handle_interrupt(
    539             self, self._fit_impl, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path
    540         )

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/call.py in _call_and_handle_interrupt(trainer, trainer_fn, *args, **kwargs)
     62         if isinstance(launcher, _SubprocessScriptLauncher):
     63             launcher.kill(_get_sigkill_signal())
---> 64         exit(1)
     65 
     66     except BaseException as exception:

NameError: name 'exit' is not defined
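
Analysis: the KeyboardInterrupt itself is expected; the real failure is in Lightning's handler. `_call_and_handle_interrupt` in `pytorch_lightning/trainer/call.py` calls the bare builtin `exit(1)`, but `exit` is only injected into builtins by the `site` module as an interactive convenience and, as the trace shows, it is not defined in this runtime when the handler runs. The handler therefore dies with `NameError: name 'exit' is not defined` and the graceful shutdown never completes.

A minimal user-side workaround sketch, assuming the missing `exit` name is the only problem (run it once before training starts):

```python
# Workaround sketch: restore the `exit` builtin so that Lightning's
# KeyboardInterrupt handler can run to completion. Assumption: nothing
# else in the shutdown path fails once `exit` resolves.
import builtins
import sys

if not hasattr(builtins, "exit"):
    builtins.exit = sys.exit  # the name the handler looks up
```

The more robust fix presumably belongs upstream: calling `sys.exit(1)` (or raising `SystemExit` directly) instead of the bare `exit(1)` in `_call_and_handle_interrupt`, since `sys.exit` does not depend on the `site` module's interactive helpers.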
