[BUG] Interrupting training fails to do a graceful shutdown #7

@sdatkinson

Description

nam v0.9.0, running the standard Colab trainer. Interrupting the training cell raises a KeyboardInterrupt; PyTorch Lightning announces that it is attempting a graceful shutdown, but the shutdown handler itself crashes with a NameError, so no graceful shutdown actually happens.
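
For context, here is a minimal sketch of the Colab cell that was interrupted. The `run` import and keyword names match the traceback below; the parameter values are placeholders, not the exact ones used:

```python
# Sketch of the NAM Colab training cell that was interrupted
# (placeholder values; the real cell sets these via the notebook UI).
from nam.train.colab import run

run(
    epochs=100,               # placeholder
    architecture="standard",  # placeholder
)
# Pressing "Interrupt execution" while this cell is training raises
# KeyboardInterrupt inside trainer.fit(); Lightning catches it and
# attempts the graceful shutdown that then fails as shown below.
```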

Stack trace:

Detected KeyboardInterrupt, attempting graceful shutdown ...
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/call.py in _call_and_handle_interrupt(trainer, trainer_fn, *args, **kwargs)
     46             return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
---> 47         return trainer_fn(*args, **kwargs)
     48 

25 frames
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/trainer.py in _fit_impl(self, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path)
    573         )
--> 574         self._run(model, ckpt_path=ckpt_path)
    575 

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/trainer.py in _run(self, model, ckpt_path)
    980         # ----------------------------
--> 981         results = self._run_stage()
    982 

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/trainer.py in _run_stage(self)
   1024             with torch.autograd.set_detect_anomaly(self._detect_anomaly):
-> 1025                 self.fit_loop.run()
   1026             return None

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/loops/fit_loop.py in run(self)
    204                 self.on_advance_start()
--> 205                 self.advance()
    206                 self.on_advance_end()

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/loops/fit_loop.py in advance(self)
    362             assert self._data_fetcher is not None
--> 363             self.epoch_loop.run(self._data_fetcher)
    364 

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/loops/training_epoch_loop.py in run(self, data_fetcher)
    139             try:
--> 140                 self.advance(data_fetcher)
    141                 self.on_advance_end(data_fetcher)

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/loops/training_epoch_loop.py in advance(self, data_fetcher)
    249                     # in automatic optimization, there can only be one optimizer
--> 250                     batch_output = self.automatic_optimization.run(trainer.optimizers[0], batch_idx, kwargs)
    251                 else:

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/loops/optimization/automatic.py in run(self, optimizer, batch_idx, kwargs)
    189         else:
--> 190             self._optimizer_step(batch_idx, closure)
    191 

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/loops/optimization/automatic.py in _optimizer_step(self, batch_idx, train_step_and_backward_closure)
    267         # model hook
--> 268         call._call_lightning_module_hook(
    269             trainer,

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/call.py in _call_lightning_module_hook(trainer, hook_name, pl_module, *args, **kwargs)
    166     with trainer.profiler.profile(f"[LightningModule]{pl_module.__class__.__name__}.{hook_name}"):
--> 167         output = fn(*args, **kwargs)
    168 

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/core/module.py in optimizer_step(self, epoch, batch_idx, optimizer, optimizer_closure)
   1305         """
-> 1306         optimizer.step(closure=optimizer_closure)
   1307 

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/core/optimizer.py in step(self, closure, **kwargs)
    152         assert self._strategy is not None
--> 153         step_output = self._strategy.optimizer_step(self._optimizer, closure, **kwargs)
    154 

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/strategies/strategy.py in optimizer_step(self, optimizer, closure, model, **kwargs)
    237         assert isinstance(model, pl.LightningModule)
--> 238         return self.precision_plugin.optimizer_step(optimizer, model=model, closure=closure, **kwargs)
    239 

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/plugins/precision/precision.py in optimizer_step(self, optimizer, model, closure, **kwargs)
    121         closure = partial(self._wrap_closure, model, optimizer, closure)
--> 122         return optimizer.step(closure=closure, **kwargs)
    123 

/usr/local/lib/python3.10/dist-packages/torch/optim/lr_scheduler.py in wrapper(*args, **kwargs)
    129                     opt._opt_called = True  # type: ignore[union-attr]
--> 130                     return func.__get__(opt, opt.__class__)(*args, **kwargs)
    131 

/usr/local/lib/python3.10/dist-packages/torch/optim/optimizer.py in wrapper(*args, **kwargs)
    483 
--> 484                 out = func(*args, **kwargs)
    485                 self._optimizer_step_code()

/usr/local/lib/python3.10/dist-packages/torch/optim/optimizer.py in _use_grad(self, *args, **kwargs)
     88             torch._dynamo.graph_break()
---> 89             ret = func(self, *args, **kwargs)
     90         finally:

/usr/local/lib/python3.10/dist-packages/torch/optim/adam.py in step(self, closure)
    225 
--> 226             adam(
    227                 params_with_grad,

/usr/local/lib/python3.10/dist-packages/torch/optim/optimizer.py in maybe_fallback(*args, **kwargs)
    160             else:
--> 161                 return func(*args, **kwargs)
    162 

/usr/local/lib/python3.10/dist-packages/torch/optim/adam.py in adam(params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs, state_steps, foreach, capturable, differentiable, fused, grad_scale, found_inf, has_complex, amsgrad, beta1, beta2, lr, weight_decay, eps, maximize)
    765 
--> 766     func(
    767         params,

/usr/local/lib/python3.10/dist-packages/torch/optim/adam.py in _multi_tensor_adam(params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs, state_steps, grad_scale, found_inf, amsgrad, has_complex, beta1, beta2, lr, weight_decay, eps, maximize, capturable, differentiable)
    517         if device_state_steps[0].is_cpu:
--> 518             torch._foreach_add_(
    519                 device_state_steps, torch.tensor(1.0, device="cpu"), alpha=1.0

KeyboardInterrupt: 

During handling of the above exception, another exception occurred:

NameError                                 Traceback (most recent call last)
<ipython-input-1-2c9b0f02eea8> in <cell line: 69>()
     67 
     68 get_ipython().run_line_magic('tensorboard', '--logdir /content/lightning_logs')
---> 69 run(
     70     epochs=epochs,
     71     architecture=architecture,

/usr/local/lib/python3.10/dist-packages/nam/train/colab.py in run(epochs, delay, model_type, architecture, lr, lr_decay, seed, user_metadata, ignore_checks, fit_cab)
     98     input_version, input_basename = _check_for_files()
     99 

--> 100     train_output: TrainOutput = train(
    101         input_basename,
    102         _OUTPUT_BASENAME,

/usr/local/lib/python3.10/dist-packages/nam/train/core.py in train(input_path, output_path, train_path, input_version, epochs, delay, latency, model_type, architecture, batch_size, ny, lr, lr_decay, seed, save_plot, silent, modelname, ignore_checks, local, fit_cab, threshold_esr, user_metadata, fast_dev_run)
   1426     # Suppress the PossibleUserWarning about num_workers (Issue 345)
   1427     with filter_warnings("ignore", category=PossibleUserWarning):
-> 1428         trainer.fit(model, train_dataloader, val_dataloader)
   1429 
   1430     # Go to best checkpoint

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/trainer.py in fit(self, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path)
    536         self.state.status = TrainerStatus.RUNNING
    537         self.training = True
--> 538         call._call_and_handle_interrupt(
    539             self, self._fit_impl, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path
    540         )

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/call.py in _call_and_handle_interrupt(trainer, trainer_fn, *args, **kwargs)
     62         if isinstance(launcher, _SubprocessScriptLauncher):
     63             launcher.kill(_get_sigkill_signal())
---> 64         exit(1)
     65 
     66     except BaseException as exception:

NameError: name 'exit' is not defined
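
Analysis: the KeyboardInterrupt itself is expected; the real failure is in Lightning's handler. `_call_and_handle_interrupt` in `pytorch_lightning/trainer/call.py` calls the bare builtin `exit(1)`, but `exit` is only injected into builtins by the `site` module as an interactive convenience and, as the trace shows, it is not defined in this runtime when the handler runs. The handler therefore dies with `NameError: name 'exit' is not defined` and the graceful shutdown never completes.

A minimal user-side workaround sketch, assuming the missing `exit` name is the only problem (run it once before training starts):

```python
# Workaround sketch: restore the `exit` builtin so that Lightning's
# KeyboardInterrupt handler can run to completion. Assumption: nothing
# else in the shutdown path fails once `exit` resolves.
import builtins
import sys

if not hasattr(builtins, "exit"):
    builtins.exit = sys.exit  # the name the handler looks up
```

The more robust fix presumably belongs upstream: calling `sys.exit(1)` (or raising `SystemExit` directly) instead of the bare `exit(1)` in `_call_and_handle_interrupt`, since `sys.exit` does not depend on the `site` module's interactive helpers.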
