diff --git a/cmdstanpy/model.py b/cmdstanpy/model.py index 9ae7f9ef..faa12efc 100644 --- a/cmdstanpy/model.py +++ b/cmdstanpy/model.py @@ -299,12 +299,10 @@ def optimize( or to a temporary directory which is deleted upon session exit. Output files are either written to a temporary directory or to the - specified output directory. Output filenames correspond to the template - '--' plus the file suffix which is - either '.csv' for the CmdStan output or '.txt' for - the console messages, e.g. 'bernoulli-201912081451-1.csv'. - Output files written to the temporary directory contain an additional - 8-character random string, e.g. 'bernoulli-201912081451-1-5nm6as7u.csv'. + specified output directory. Optimize output filenames correspond to + the template '-' plus the file suffix which is + either '.csv' for the CmdStan output or '_stdout.txt' for + the console messages, e.g. 'bernoulli-20251107142835.csv'. :param data: Values for all data variables in the model, specified either as a dictionary with entries matching the data variables, @@ -339,7 +337,7 @@ def optimize( :param save_profile: Whether or not to profile auto-diff operations in labelled blocks of code. If ``True``, CSV outputs are written to - file '--profile-'. + file '-_profile.csv'. Introduced in CmdStan-2.26. :param algorithm: Algorithm to use. One of: 'BFGS', 'LBFGS', 'Newton' @@ -514,11 +512,15 @@ def sample( Output files are either written to a temporary directory or to the specified output directory. Ouput filenames correspond to the template - '--' plus the file suffix which is - either '.csv' for the CmdStan output or '.txt' for - the console messages, e.g. 'bernoulli-201912081451-1.csv'. - Output files written to the temporary directory contain an additional - 8-character random string, e.g. 'bernoulli-201912081451-1-5nm6as7u.csv'. + '-' plus additional bits to identify which + output file it corresponds to. 
CmdStan output will suffix with + '_.csv' if there is more than one chain, and simply '.csv' + in the single-chain case. For example, 'bernoulli-20251107144515_1.csv'. + Console message output is written to a text file suffixed + `_stdout_.txt` if each chain executes in a separate process + (default behavior) or simply `_stdout.txt` if done so in a single + process, such as when STAN_THREADS is enabled and you are sampling + more than one chain. :param data: Values for all data variables in the model, specified either as a dictionary with entries matching the data variables, @@ -651,14 +653,17 @@ def sample( :param save_latent_dynamics: Whether or not to output the position and momentum information for the model parameters (unconstrained). If ``True``, CSV outputs are written to an output file - '--diagnostic-', - e.g. 'bernoulli-201912081451-diagnostic-1.csv', see + '-_diagnostic_', + e.g. 'bernoulli-201912081451_diagnostic_1.csv', see https://mc-stan.org/docs/cmdstan-guide/stan_csv.html, section "Diagnostic CSV output file" for details. :param save_profile: Whether or not to profile auto-diff operations in labelled blocks of code. If ``True``, CSV outputs are written to - file '--profile-'. + file '-_profile_.csv' if each + chain runs in its own process, otherwise + '-_profile.csv' if all chains run in a + single process. Introduced in CmdStan-2.26, see https://mc-stan.org/docs/cmdstan-guide/stan_csv.html, section "Profiling CSV output file" for details. @@ -983,12 +988,16 @@ def generate_quantities( or to a temporary directory which is deleted upon session exit. Output files are either written to a temporary directory or to the - specified output directory. Output filenames correspond to the template - '--' plus the file suffix which is - either '.csv' for the CmdStan output or '.txt' for - the console messages, e.g. 'bernoulli-201912081451-1.csv'. - Output files written to the temporary directory contain an additional - 8-character random string, e.g. 
'bernoulli-201912081451-1-5nm6as7u.csv'. + specified output directory. Output filenames correspond to the template + '-' plus additional bits to identify which + output file it corresponds to. CmdStan output will suffix with + '_.csv' if there is more than one chain, and simply '.csv' + in the single-chain case. For example, 'bernoulli-20251107144515_1.csv'. + Console message output is written to a text file suffixed + `_stdout_.txt` if each chain executes in a separate process + (default behavior) or simply `_stdout.txt` if done so in a single + process, such as when STAN_THREADS is enabled and you are sampling + more than one chain. :param data: Values for all data variables in the model, specified either as a dictionary with entries matching the data variables, @@ -1175,11 +1184,9 @@ def variational( Output files are either written to a temporary directory or to the specified output directory. Output filenames correspond to the template - '--' plus the file suffix which is - either '.csv' for the CmdStan output or '.txt' for - the console messages, e.g. 'bernoulli-201912081451-1.csv'. - Output files written to the temporary directory contain an additional - 8-character random string, e.g. 'bernoulli-201912081451-1-5nm6as7u.csv'. + '-' plus the file suffix which is + either '.csv' for the CmdStan output or '_stdout.txt' for + the console messages, e.g. 'bernoulli-201912081451.csv'. :param data: Values for all data variables in the model, specified either as a dictionary with entries matching the data variables, @@ -1458,7 +1465,7 @@ def pathfinder( :param save_profile: Whether or not to profile auto-diff operations in labelled blocks of code. If ``True``, CSV outputs are written to - file '--profile-'. + file '-_profile.csv'. Introduced in CmdStan-2.26, see https://mc-stan.org/docs/cmdstan-guide/stan_csv.html, section "Profiling CSV output file" for details. 
@@ -1706,7 +1713,7 @@ def laplace_sample( :param save_profile: Whether or not to profile auto-diff operations in labelled blocks of code. If ``True``, CSV outputs are written to - file '--profile-'. + file '-_profile.csv'. Introduced in CmdStan-2.26, see https://mc-stan.org/docs/cmdstan-guide/stan_csv.html, section "Profiling CSV output file" for details. diff --git a/cmdstanpy/stanfit/gq.py b/cmdstanpy/stanfit/gq.py index 6a1bfc63..184d7d45 100644 --- a/cmdstanpy/stanfit/gq.py +++ b/cmdstanpy/stanfit/gq.py @@ -705,10 +705,7 @@ def _previous_draws_pd( def save_csvfiles(self, dir: str | None = None) -> None: """ - Move output CSV files to specified directory. If files were - written to the temporary session directory, clean filename. - E.g., save 'bernoulli-201912081451-1-5nm6as7u.csv' as - 'bernoulli-201912081451-1.csv'. + Move output CSV files to specified directory. :param dir: directory path diff --git a/cmdstanpy/stanfit/laplace.py b/cmdstanpy/stanfit/laplace.py index 314cccf6..f485b4ec 100644 --- a/cmdstanpy/stanfit/laplace.py +++ b/cmdstanpy/stanfit/laplace.py @@ -310,10 +310,7 @@ def column_names(self) -> tuple[str, ...]: def save_csvfiles(self, dir: str | None = None) -> None: """ - Move output CSV files to specified directory. If files were - written to the temporary session directory, clean filename. - E.g., save 'bernoulli-201912081451-1-5nm6as7u.csv' as - 'bernoulli-201912081451-1.csv'. + Move output CSV files to specified directory. :param dir: directory path diff --git a/cmdstanpy/stanfit/mcmc.py b/cmdstanpy/stanfit/mcmc.py index 7400db9f..03bf2234 100644 --- a/cmdstanpy/stanfit/mcmc.py +++ b/cmdstanpy/stanfit/mcmc.py @@ -824,10 +824,7 @@ def method_variables(self) -> dict[str, np.ndarray]: def save_csvfiles(self, dir: str | None = None) -> None: """ - Move output CSV files to specified directory. If files were - written to the temporary session directory, clean filename. 
- E.g., save 'bernoulli-201912081451-1-5nm6as7u.csv' as - 'bernoulli-201912081451-1.csv'. + Move output CSV files to specified directory. :param dir: directory path diff --git a/cmdstanpy/stanfit/mle.py b/cmdstanpy/stanfit/mle.py index 276a5496..03b44799 100644 --- a/cmdstanpy/stanfit/mle.py +++ b/cmdstanpy/stanfit/mle.py @@ -295,10 +295,7 @@ def stan_variables( def save_csvfiles(self, dir: str | None = None) -> None: """ - Move output CSV files to specified directory. If files were - written to the temporary session directory, clean filename. - E.g., save 'bernoulli-201912081451-1-5nm6as7u.csv' as - 'bernoulli-201912081451-1.csv'. + Move output CSV files to specified directory. :param dir: directory path diff --git a/cmdstanpy/stanfit/pathfinder.py b/cmdstanpy/stanfit/pathfinder.py index 8549c78c..e18a76c4 100644 --- a/cmdstanpy/stanfit/pathfinder.py +++ b/cmdstanpy/stanfit/pathfinder.py @@ -216,10 +216,7 @@ def is_resampled(self) -> bool: def save_csvfiles(self, dir: str | None = None) -> None: """ - Move output CSV files to specified directory. If files were - written to the temporary session directory, clean filename. - E.g., save 'bernoulli-201912081451-1-5nm6as7u.csv' as - 'bernoulli-201912081451-1.csv'. + Move output CSV files to specified directory. 
:param dir: directory path diff --git a/cmdstanpy/stanfit/runset.py b/cmdstanpy/stanfit/runset.py index cbcb7e7c..f88f55f3 100644 --- a/cmdstanpy/stanfit/runset.py +++ b/cmdstanpy/stanfit/runset.py @@ -38,10 +38,7 @@ def __init__( self._args = args self._chains = chains self._one_process_per_chain = one_process_per_chain - if one_process_per_chain: - self._num_procs = chains - else: - self._num_procs = 1 + self._num_procs = chains if one_process_per_chain else 1 self._retcodes = [-1 for _ in range(self._num_procs)] self._timeout_flags = [False for _ in range(self._num_procs)] if chain_ids is None: @@ -49,51 +46,51 @@ def __init__( self._chain_ids = chain_ids if args.output_dir is not None: - self._output_dir = args.output_dir - else: - # make a per-run subdirectory of our master temp directory - self._output_dir = tempfile.mkdtemp( - prefix=args.model_name, dir=_TMPDIR - ) + self._outdir = args.output_dir + else: # make a per-run subdirectory of our master temp directory + self._outdir = tempfile.mkdtemp(prefix=args.model_name, dir=_TMPDIR) # output files prefix: ``-_`` self._base_outfile = ( f'{args.model_name}-{datetime.now().strftime(time_fmt)}' ) - # per-process outputs - self._stdout_files = [''] * self._num_procs - self._profile_files = [''] * self._num_procs # optional - if one_process_per_chain: - for i in range(chains): - self._stdout_files[i] = self.file_path("-stdout.txt", id=i) - if args.save_profile: - self._profile_files[i] = self.file_path( - ".csv", extra="-profile", id=chain_ids[i] - ) + self._stdout_files, self._profile_files = [], [] + self._csv_files, self._diagnostic_files = [], [] + + # per-process output files + if one_process_per_chain and chains > 1: + self._stdout_files = [ + self.gen_file_name(".txt", extra="stdout", id=id) + for id in self._chain_ids + ] + if args.save_profile: + self._profile_files = [ + self.gen_file_name(".csv", extra="profile", id=id) + for id in self._chain_ids + ] else: - self._stdout_files[0] = 
self.file_path("-stdout.txt") + self._stdout_files = [self.gen_file_name(".txt", extra="stdout")] if args.save_profile: - self._profile_files[0] = self.file_path( - ".csv", extra="-profile" - ) + self._profile_files = [ + self.gen_file_name(".csv", extra="profile") + ] # per-chain output files - self._csv_files: list[str] = [''] * chains - self._diagnostic_files = [''] * chains # optional - if chains == 1: - self._csv_files[0] = self.file_path(".csv") + self._csv_files = [self.gen_file_name(".csv")] if args.save_latent_dynamics: - self._diagnostic_files[0] = self.file_path( - ".csv", extra="-diagnostic" - ) + self._diagnostic_files = [ + self.gen_file_name(".csv", extra="diagnostic") + ] else: - for i in range(chains): - self._csv_files[i] = self.file_path(".csv", id=chain_ids[i]) - if args.save_latent_dynamics: - self._diagnostic_files[i] = self.file_path( - ".csv", extra="-diagnostic", id=chain_ids[i] - ) + self._csv_files = [ + self.gen_file_name(".csv", id=id) for id in self._chain_ids + ] + if args.save_latent_dynamics: + self._diagnostic_files = [ + self.gen_file_name(".csv", extra="diagnostic", id=id) + for id in self._chain_ids + ] def __repr__(self) -> str: repr = 'RunSet: chains={}, chain_ids={}, num_processes={}'.format( @@ -173,14 +170,14 @@ def cmd(self, idx: int) -> list[str]: else: return self._args.compose_command( idx, - csv_file=self.file_path('.csv'), + csv_file=self.gen_file_name('.csv'), diagnostic_file=( - self.file_path(".csv", extra="-diagnostic") + self.gen_file_name(".csv", extra="diagnostic") if self._args.save_latent_dynamics else None ), profile_file=( - self.file_path(".csv", extra="-profile") + self.gen_file_name(".csv", extra="profile") if self._args.save_profile else None ), @@ -201,10 +198,7 @@ def stdout_files(self) -> list[str]: def _check_retcodes(self) -> bool: """Returns ``True`` when all chains have retcode 0.""" - for code in self._retcodes: - if code != 0: - return False - return True + return all(retcode == 0 for retcode 
in self._retcodes) @property def diagnostic_files(self) -> list[str]: @@ -216,16 +210,17 @@ def profile_files(self) -> list[str]: """List of paths to CmdStan profiler files.""" return self._profile_files - # pylint: disable=invalid-name - def file_path( + def gen_file_name( self, suffix: str, *, extra: str = "", id: int | None = None ) -> str: + """Generate a standard file name according to CmdStan output pattern""" + file = self._base_outfile + if extra: + file += f"_{extra}" if id is not None: - suffix = f"_{id}{suffix}" - file = os.path.join( - self._output_dir, f"{self._base_outfile}{extra}{suffix}" - ) - return file + file += f"_{id}" + file += suffix + return os.path.join(self._outdir, file) def _retcode(self, idx: int) -> int: """Get retcode for process[idx].""" diff --git a/cmdstanpy/stanfit/vb.py b/cmdstanpy/stanfit/vb.py index dd00fc8f..7a1f59b0 100644 --- a/cmdstanpy/stanfit/vb.py +++ b/cmdstanpy/stanfit/vb.py @@ -249,10 +249,7 @@ def variational_sample_pd(self) -> pd.DataFrame: def save_csvfiles(self, dir: str | None = None) -> None: """ - Move output CSV files to specified directory. If files were - written to the temporary session directory, clean filename. - E.g., save 'bernoulli-201912081451-1-5nm6as7u.csv' as - 'bernoulli-201912081451-1.csv'. + Move output CSV files to specified directory. 
:param dir: directory path diff --git a/cmdstanpy_tutorial.ipynb b/cmdstanpy_tutorial.ipynb index 38401605..28517d29 100644 --- a/cmdstanpy_tutorial.ipynb +++ b/cmdstanpy_tutorial.ipynb @@ -98,6 +98,7 @@ "CmdStanPy will use the following optional packages, if installed:\n", "\n", "* `xarray`, an n-dimension labeled dataset package which can be used for outputs\n", + "* `polars`, a highly-optimized data manipulation library, which can speed up processing outputs of large Stan models\n", "\n", "To install CmdStanPy with all the optional packages:\n", "\n", @@ -408,7 +409,7 @@ "hash": "d31ce8e45781476cfd394e192e0962028add96ff436d4fd4e560a347d206b9cb" }, "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -422,9 +423,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.10.19" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/docsrc/users-guide/outputs.rst b/docsrc/users-guide/outputs.rst index 56f650a5..96d25f19 100644 --- a/docsrc/users-guide/outputs.rst +++ b/docsrc/users-guide/outputs.rst @@ -8,9 +8,9 @@ CSV File Outputs Underlyingly, the CmdStan outputs are a set of per-chain `Stan CSV files `__. -The filenames follow the template '--' -plus the file suffix '.csv'. -CmdStanPy also captures the per-chain console and error messages. +The filenames follow the template '-_' +plus the file suffix '.csv'. CmdStanPy also captures the per-chain console and +error messages. .. 
ipython:: python diff --git a/test/test_generate_quantities.py b/test/test_generate_quantities.py index c65907b2..ab97219d 100644 --- a/test/test_generate_quantities.py +++ b/test/test_generate_quantities.py @@ -533,7 +533,7 @@ def test_serialization() -> None: fit1 = model.generate_quantities(data=jdata, previous_fit=fit_sampling) dumped = pickle.dumps(fit1) - shutil.rmtree(fit1.runset._output_dir) + shutil.rmtree(fit1.runset._outdir) fit2: CmdStanGQ[CmdStanMCMC] = pickle.loads(dumped) variables1 = fit1.stan_variables() variables2 = fit2.stan_variables() diff --git a/test/test_optimize.py b/test/test_optimize.py index 1060640e..2f255468 100644 --- a/test/test_optimize.py +++ b/test/test_optimize.py @@ -664,7 +664,7 @@ def test_serialization() -> None: history_size=5, ) dumped = pickle.dumps(mle1) - shutil.rmtree(mle1.runset._output_dir) + shutil.rmtree(mle1.runset._outdir) mle2: CmdStanMLE = pickle.loads(dumped) np.testing.assert_array_equal( mle1.optimized_params_np, mle2.optimized_params_np diff --git a/test/test_runset.py b/test/test_runset.py index d176e01e..4576d924 100644 --- a/test/test_runset.py +++ b/test/test_runset.py @@ -82,7 +82,7 @@ def test_get_err_msgs() -> None: assert 'Exception: variable does not exist' in errs -def test_output_filenames() -> None: +def test_output_filenames_one_proc_per_chain() -> None: exe = os.path.join(DATAFILES_PATH, 'bernoulli' + EXTENSION) jdata = os.path.join(DATAFILES_PATH, 'bernoulli.data.json') sampler_args = SamplerArgs() @@ -94,10 +94,137 @@ def test_output_filenames() -> None: data=jdata, method_args=sampler_args, ) - runset = RunSet(args=cmdstan_args, chains=4) - assert 'bernoulli-' in runset._csv_files[0] - assert '_1.csv' in runset._csv_files[0] - assert '_4.csv' in runset._csv_files[3] + runset = RunSet(args=cmdstan_args, chains=4, one_process_per_chain=True) + + assert all("bernoulli-" in csv_file for csv_file in runset.csv_files) + assert all( + csv_file.endswith(f"_{id}.csv") + for id, csv_file in 
zip(chain_ids, runset.csv_files) + ) + assert len(runset.stdout_files) == len(chain_ids) + assert all( + stdout_file.endswith(f"_stdout_{id}.txt") + for id, stdout_file in zip(chain_ids, runset.stdout_files) + ) + + cmdstan_args_other_files = CmdStanArgs( + model_name='bernoulli', + model_exe=exe, + chain_ids=chain_ids, + data=jdata, + method_args=sampler_args, + save_latent_dynamics=True, + save_profile=True, + ) + runset_other_files = RunSet( + args=cmdstan_args_other_files, chains=4, one_process_per_chain=True + ) + assert len(runset_other_files.diagnostic_files) == len(chain_ids) + assert all( + diag_file.endswith(f"_diagnostic_{id}.csv") + for id, diag_file in zip(chain_ids, runset_other_files.diagnostic_files) + ) + + assert len(runset_other_files.profile_files) == len(chain_ids) + assert all( + prof_file.endswith(f"_profile_{id}.csv") + for id, prof_file in zip(chain_ids, runset_other_files.profile_files) + ) + + +def test_output_filenames_threading() -> None: + exe = os.path.join(DATAFILES_PATH, 'bernoulli' + EXTENSION) + jdata = os.path.join(DATAFILES_PATH, 'bernoulli.data.json') + sampler_args = SamplerArgs() + chain_ids = [1, 2, 3, 4] + cmdstan_args = CmdStanArgs( + model_name='bernoulli', + model_exe=exe, + chain_ids=chain_ids, + data=jdata, + method_args=sampler_args, + ) + runset = RunSet(args=cmdstan_args, chains=4, one_process_per_chain=False) + + assert all("bernoulli-" in csv_file for csv_file in runset.csv_files) + assert all( + csv_file.endswith(f"_{id}.csv") + for id, csv_file in zip(chain_ids, runset.csv_files) + ) + assert len(runset.stdout_files) == 1 + assert runset.stdout_files[0].endswith("_stdout.txt") + + cmdstan_args_other_files = CmdStanArgs( + model_name='bernoulli', + model_exe=exe, + chain_ids=chain_ids, + data=jdata, + method_args=sampler_args, + save_latent_dynamics=True, + save_profile=True, + ) + runset_other_files = RunSet( + args=cmdstan_args_other_files, chains=4, one_process_per_chain=False + ) + assert 
len(runset_other_files.diagnostic_files) == len(chain_ids) + assert all( + diag_file.endswith(f"_diagnostic_{id}.csv") + for id, diag_file in zip(chain_ids, runset_other_files.diagnostic_files) + ) + + assert len(runset_other_files.profile_files) == 1 + assert runset_other_files.profile_files[0].endswith("_profile.csv") + + +def test_output_filenames_single_chain() -> None: + exe = os.path.join(DATAFILES_PATH, 'bernoulli' + EXTENSION) + jdata = os.path.join(DATAFILES_PATH, 'bernoulli.data.json') + sampler_args = SamplerArgs() + chain_ids = [1] + cmdstan_args = CmdStanArgs( + model_name='bernoulli', + model_exe=exe, + chain_ids=chain_ids, + data=jdata, + method_args=sampler_args, + ) + runset = RunSet(args=cmdstan_args, chains=1, one_process_per_chain=False) + base_file = runset._base_outfile + assert len(runset.csv_files) == 1 + assert len(runset.stdout_files) == 1 + assert runset.csv_files[0].endswith(f"{base_file}.csv") + assert runset.stdout_files[0].endswith(f"{base_file}_stdout.txt") + + runset = RunSet(args=cmdstan_args, chains=1, one_process_per_chain=True) + base_file = runset._base_outfile + assert runset.stdout_files[0].endswith(f"{base_file}_stdout.txt") + + cmdstan_args_other_files = CmdStanArgs( + model_name='bernoulli', + model_exe=exe, + chain_ids=chain_ids, + data=jdata, + method_args=sampler_args, + save_latent_dynamics=True, + save_profile=True, + ) + runset_other_files = RunSet( + args=cmdstan_args_other_files, chains=1, one_process_per_chain=False + ) + assert len(runset_other_files.diagnostic_files) == 1 + assert runset_other_files.diagnostic_files[0].endswith("_diagnostic.csv") + + assert len(runset_other_files.profile_files) == 1 + assert runset_other_files.profile_files[0].endswith("_profile.csv") + + runset_other_files = RunSet( + args=cmdstan_args_other_files, chains=1, one_process_per_chain=True + ) + assert len(runset_other_files.diagnostic_files) == 1 + assert runset_other_files.diagnostic_files[0].endswith("_diagnostic.csv") + + assert 
len(runset_other_files.profile_files) == 1 + assert runset_other_files.profile_files[0].endswith("_profile.csv") def test_commands() -> None: diff --git a/test/test_sample.py b/test/test_sample.py index 6571dac4..9af7b499 100644 --- a/test/test_sample.py +++ b/test/test_sample.py @@ -2135,7 +2135,7 @@ def test_serialization(stanfile: str = 'bernoulli.stan') -> None: ) # Dump the result (which assembles draws) and delete the source files. dumped = pickle.dumps(bern_fit1) - shutil.rmtree(bern_fit1.runset._output_dir) + shutil.rmtree(bern_fit1.runset._outdir) # Load the serialized result and compare results. bern_fit2: CmdStanMCMC = pickle.loads(dumped) variables1 = bern_fit1.stan_variables() diff --git a/test/test_variational.py b/test/test_variational.py index 66c47826..edc6db4d 100644 --- a/test/test_variational.py +++ b/test/test_variational.py @@ -335,7 +335,7 @@ def test_serialization() -> None: model = CmdStanModel(stan_file=stan) variational1 = model.variational(algorithm='meanfield', seed=999999) dumped = pickle.dumps(variational1) - shutil.rmtree(variational1.runset._output_dir) + shutil.rmtree(variational1.runset._outdir) variational2: CmdStanVB = pickle.loads(dumped) np.testing.assert_array_equal( variational1.variational_sample, variational2.variational_sample