diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2a6b8836..3129489e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,13 @@
 # Changelog
 
+## v0.5.2
+
+- Python code is now formatted with `black` and `isort`, and has been refactored according to the `PEP 8` style guide.
+- Python code is now partially commented using the `google` docstring format.
+- C++ code is now partially commented using the `doxygen` docstring format.
+- The Scott's rule and Normal Reference Rule `bandwidth` calculations have been reordered and commented.
+- `ArcOperatorSet::update_incoming_arcs_scores` formulas have been reordered and commented.
+
 ## v0.5.1
 
 - Fixes vcpkg bad hashes ([vcpkg/#38974](https://github.com/microsoft/vcpkg/issues/38974)).
@@ -101,7 +109,7 @@ build process is simpler and orchestrated by scikit-build-core and a CMakeLists.
 - Added a `ProductKDE` class that implements `KDE` with diagonal bandwidth matrix.
 - Added an abstract class `BandwidthSelector` to implement bandwidth selection for `KDE` and `ProductKDE`. Three
   concrete implementations of bandwidth selection are included: `ScottsBandwidth`, `NormalReferenceRule` and `UCV`.
-- Added `Arguments`, `Args` and `Kwargs` to store a set of arguments to be used to create new factors through
+- Added `arguments`, `args` and `kwargs` to store a set of arguments to be used to create new factors through
   `FactorType::new_factor()`. The `Arguments` are accepted by `BayesianNetworkBase::fit()` and the constructors of
   `CVLikelihood`, `HoldoutLikelihood` and `ValidatedLikelihood`.
@@ -113,8 +121,8 @@ build process is simpler and orchestrated by scikit-build-core and a CMakeLists.
 ## v0.2.0
 
 - Added conditional linear Gaussian networks (`CLGNetworkType`, `CLGNetwork`, `ConditionalCLGNetwork` and `DynamicCLGNetwork`).
-- Implemented `ChiSquare` (and `DynamicChiSquare`) indepencence test.
-- Implemented `MutualInformation` (and `DynamicMutualInformation`) indepencence test. This is valid for hybrid data.
+- Implemented `ChiSquare` (and `DynamicChiSquare`) independence test.
+- Implemented `MutualInformation` (and `DynamicMutualInformation`) independence test. This is valid for hybrid data.
 - Implemented `BDe` (Bayesian Dirichlet equivalent) score (and `DynamicBDe`).
 - Added `UnknownFactorType` as default `FactorType` for Bayesian networks when the node type could not be deduced.
 - Added `Assignment` class to represent the assignment of values to variables.
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 52f06641..93819cc4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,67 +1,82 @@
 cmake_minimum_required(VERSION 3.20.0)
 
-IF(WIN32)
+if(WIN32)
     set(VCPKG_TARGET_TRIPLET x64-windows-static)
-ENDIF()
-
-IF(APPLE)
-    SET(CMAKE_C_COMPILER "clang")
-    SET(CMAKE_CXX_COMPILER "clang++")
-ENDIF()
+endif()
+
+if(APPLE)
+    set(CMAKE_C_COMPILER "clang")
+    set(CMAKE_CXX_COMPILER "clang++")
+endif()
+
+if(UNIX AND NOT APPLE)
+    set(CMAKE_C_COMPILER "gcc")
+    set(CMAKE_CXX_COMPILER "g++")
+endif()
+
+# Enable ccache if available
+find_program(CCACHE_PROGRAM ccache)
+if(CCACHE_PROGRAM)
+    message(STATUS "ccache found: ${CCACHE_PROGRAM}")
+    set(CMAKE_C_COMPILER_LAUNCHER "${CCACHE_PROGRAM}")
+    set(CMAKE_CXX_COMPILER_LAUNCHER "${CCACHE_PROGRAM}")
+else()
+    message(STATUS "ccache not found. Compilation will proceed without caching.")
+endif()
 
 find_package(Git REQUIRED)
 message("Git executable: ${GIT_EXECUTABLE}")
 
-IF(EXISTS ".git")
-    SET(GIT_COMMAND_EXECUTED "{GIT_EXECUTABLE} submodule update --init --recursive")
+if(EXISTS ".git")
+    set(GIT_COMMAND_EXECUTED "${GIT_EXECUTABLE} submodule update --init --recursive")
     execute_process(COMMAND ${GIT_EXECUTABLE} submodule update --init --recursive
-                    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} RESULT_VARIABLE GIT_SUBMOD_RESULT)
+        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} RESULT_VARIABLE GIT_SUBMOD_RESULT)
 
-    IF(NOT GIT_SUBMOD_RESULT EQUAL "0")
+    if(NOT GIT_SUBMOD_RESULT EQUAL "0")
         message(FATAL_ERROR "${GIT_COMMAND_EXECUTED} failed with ${GIT_SUBMOD_RESULT}.")
-    ENDIF()
-ELSE()
-    SET(GIT_COMMAND_EXECUTED "${GIT_EXECUTABLE} clone https://github.com/Microsoft/vcpkg.git")
+    endif()
+else()
+    set(GIT_COMMAND_EXECUTED "${GIT_EXECUTABLE} clone https://github.com/Microsoft/vcpkg.git")
     execute_process(COMMAND ${GIT_EXECUTABLE} clone https://github.com/Microsoft/vcpkg.git
-                    WORKING_DIRECTORY "." RESULT_VARIABLE GIT_SUBMOD_RESULT)
+        WORKING_DIRECTORY "." RESULT_VARIABLE GIT_SUBMOD_RESULT)
 
-    IF(NOT GIT_SUBMOD_RESULT EQUAL "0")
+    if(NOT GIT_SUBMOD_RESULT EQUAL "0")
         message(FATAL_ERROR "${GIT_COMMAND_EXECUTED} failed with ${GIT_SUBMOD_RESULT}.")
-    ENDIF()
-ENDIF()
+    endif()
+endif()
 
-SET(GIT_COMMIT_HASH "2024.08.23")
+set(GIT_COMMIT_HASH "2024.08.23")
 
-SET(GIT_COMMAND_EXECUTED "${GIT_EXECUTABLE} checkout ${GIT_COMMIT_HASH}")
+set(GIT_COMMAND_EXECUTED "${GIT_EXECUTABLE} checkout ${GIT_COMMIT_HASH}")
 execute_process(COMMAND ${GIT_EXECUTABLE} checkout ${GIT_COMMIT_HASH}
-                WORKING_DIRECTORY "vcpkg" RESULT_VARIABLE GIT_SUBMOD_RESULT)
+    WORKING_DIRECTORY "vcpkg" RESULT_VARIABLE GIT_SUBMOD_RESULT)
 
-IF(NOT GIT_SUBMOD_RESULT EQUAL "0")
+if(NOT GIT_SUBMOD_RESULT EQUAL "0")
     message(FATAL_ERROR "${GIT_COMMAND_EXECUTED} failed with ${GIT_SUBMOD_RESULT}.")
-ENDIF()
+endif()
 
-SET(GIT_COMMAND_EXECUTED "${GIT_EXECUTABLE} rev-parse HEAD")
+set(GIT_COMMAND_EXECUTED "${GIT_EXECUTABLE} rev-parse HEAD")
 execute_process(COMMAND ${GIT_EXECUTABLE} rev-parse HEAD
-                WORKING_DIRECTORY "vcpkg" RESULT_VARIABLE GIT_SUBMOD_RESULT OUTPUT_VARIABLE GIT_STDOUT)
+    WORKING_DIRECTORY "vcpkg" RESULT_VARIABLE GIT_SUBMOD_RESULT OUTPUT_VARIABLE GIT_STDOUT)
 
-IF(NOT GIT_SUBMOD_RESULT EQUAL "0")
+if(NOT GIT_SUBMOD_RESULT EQUAL "0")
     message(FATAL_ERROR "${GIT_COMMAND_EXECUTED} failed with ${GIT_SUBMOD_RESULT}.")
-ENDIF()
+endif()
 
 message("Git commit in vcpkg: ${GIT_STDOUT}")
 
 set(CMAKE_TOOLCHAIN_FILE "vcpkg/scripts/buildsystems/vcpkg.cmake")
 
 project(pybnesian VERSION ${SKBUILD_PROJECT_VERSION} LANGUAGES CXX)
-ADD_DEFINITIONS("-DVERSION_INFO=${SKBUILD_PROJECT_VERSION}")
+add_definitions("-DVERSION_INFO=${SKBUILD_PROJECT_VERSION}")
 
 set(CMAKE_CXX_STANDARD 17)
 
-IF(MSVC)
-    SET(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreaded$<$<CONFIG:Debug>:Debug>")
-    ADD_DEFINITIONS("-DNOGDI")
-ENDIF()
+if(MSVC)
+    set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreaded$<$<CONFIG:Debug>:Debug>")
+    add_definitions("-DNOGDI")
+endif()
 
 set(PYBIND11_NEWPYTHON ON)
 find_package(Python COMPONENTS Interpreter Development)
@@ -72,31 +87,35 @@ message("Minor version: ${Python_VERSION_MINOR}")
 
 add_definitions(-DPYTHON_VERSION_MAJOR=${Python_VERSION_MAJOR} -DPYTHON_VERSION_MINOR=${Python_VERSION_MINOR})
 
-IF(WIN32)
-    SET(SCRIPT_PREFIX "")
-    SET(SCRIPT_EXTENSION "bat")
-ELSEIF(UNIX)
-    SET(SCRIPT_PREFIX "./")
-    SET(SCRIPT_EXTENSION "sh")
-ENDIF()
+if(WIN32)
+    set(SCRIPT_PREFIX "")
+    set(SCRIPT_EXTENSION "bat")
+elseif(UNIX)
+    set(SCRIPT_PREFIX "./")
+    
set(SCRIPT_EXTENSION "sh") +endif() + +# Find the Python interpreter +find_package(PythonInterp 3 REQUIRED) -execute_process(COMMAND python expand_sources.py RESULT_VARIABLE EXPAND_SOURCES_RESULT) +# Use the found Python interpreter in the execute_process command +execute_process(COMMAND ${PYTHON_EXECUTABLE} expand_sources.py RESULT_VARIABLE EXPAND_SOURCES_RESULT) -IF(NOT EXPAND_SOURCES_RESULT EQUAL "0") - message(FATAL_ERROR "$python expand_sources.py failed with ${EXPAND_SOURCES_RESULT}") -ENDIF() +if(NOT EXPAND_SOURCES_RESULT EQUAL "0") + message(FATAL_ERROR "${PYTHON_EXECUTABLE} expand_sources.py failed with ${EXPAND_SOURCES_RESULT}") +endif() execute_process(COMMAND ${SCRIPT_PREFIX}bootstrap-vcpkg.${SCRIPT_EXTENSION} WORKING_DIRECTORY "vcpkg" RESULT_VARIABLE VCPKG_BOOTSTRAP_RESULT) -IF(NOT VCPKG_BOOTSTRAP_RESULT EQUAL "0") +if(NOT VCPKG_BOOTSTRAP_RESULT EQUAL "0") message(FATAL_ERROR "${SCRIPT_PREFIX}bootstrap-vcpkg.${SCRIPT_EXTENSION} failed with ${VCPKG_BOOTSTRAP_RESULT}") -ENDIF() +endif() execute_process(COMMAND ${SCRIPT_PREFIX}vcpkg install WORKING_DIRECTORY "vcpkg" RESULT_VARIABLE VCPKG_INSTALL_RESULT) -IF(NOT VCPKG_INSTALL_RESULT EQUAL "0") +if(NOT VCPKG_INSTALL_RESULT EQUAL "0") message(FATAL_ERROR "${SCRIPT_PREFIX}vcpkg install failed with ${VCPKG_INSTALL_RESULT}") -ENDIF() +endif() find_package(Arrow CONFIG REQUIRED) message("Arrow found: ${Arrow_FOUND}") @@ -117,68 +136,70 @@ find_package(Boost REQUIRED COMPONENTS math dynamic_bitset) find_package(OpenCL REQUIRED) pybind11_add_module(__init__ "pybnesian/lib.cpp" - "pybnesian/pybindings/pybindings_dataset.cpp" - "pybnesian/pybindings/pybindings_kde.cpp" - "pybnesian/pybindings/pybindings_factors.cpp" - "pybnesian/pybindings/pybindings_graph.cpp" - "pybnesian/pybindings/pybindings_models.cpp" - "pybnesian/pybindings/pybindings_learning/pybindings_learning.cpp" - "pybnesian/pybindings/pybindings_learning/pybindings_scores.cpp" - "pybnesian/pybindings/pybindings_learning/pybindings_independences.cpp" - "pybnesian/pybindings/pybindings_learning/pybindings_parameters.cpp" - "pybnesian/pybindings/pybindings_learning/pybindings_mle.cpp" - "pybnesian/pybindings/pybindings_learning/pybindings_operators.cpp" - "pybnesian/pybindings/pybindings_learning/pybindings_algorithms.cpp" - "pybnesian/kde/KDE.cpp" - "pybnesian/kde/ProductKDE.cpp" - "pybnesian/kde/UCV.cpp" - "pybnesian/factors/continuous/LinearGaussianCPD.cpp" - "pybnesian/factors/continuous/CKDE.cpp" - "pybnesian/factors/discrete/DiscreteFactor.cpp" - "pybnesian/factors/discrete/discrete_indices.cpp" - "pybnesian/dataset/dataset.cpp" - "pybnesian/dataset/dynamic_dataset.cpp" - "pybnesian/dataset/crossvalidation_adaptator.cpp" - "pybnesian/dataset/holdout_adaptator.cpp" - "pybnesian/util/arrow_types.cpp" - "pybnesian/util/bit_util.cpp" - "pybnesian/util/validate_options.cpp" - "pybnesian/util/validate_whitelists.cpp" - "pybnesian/util/temporal.cpp" - "pybnesian/util/rpoly.cpp" - "pybnesian/util/vech_ops.cpp" - "pybnesian/util/pickle.cpp" - "pybnesian/util/util_types.cpp" - "pybnesian/kdtree/kdtree.cpp" - "pybnesian/learning/operators/operators.cpp" - "pybnesian/learning/algorithms/hillclimbing.cpp" - "pybnesian/learning/algorithms/pc.cpp" - "pybnesian/learning/algorithms/mmpc.cpp" - "pybnesian/learning/algorithms/mmhc.cpp" - "pybnesian/learning/algorithms/dmmhc.cpp" - "pybnesian/learning/independences/continuous/linearcorrelation.cpp" - "pybnesian/learning/independences/continuous/mutual_information.cpp" - "pybnesian/learning/independences/continuous/RCoT.cpp" - 
"pybnesian/learning/independences/discrete/chi_square.cpp" - "pybnesian/learning/independences/hybrid/mutual_information.cpp" - "pybnesian/learning/parameters/mle_LinearGaussianCPD.cpp" - "pybnesian/learning/parameters/mle_DiscreteFactor.cpp" - "pybnesian/learning/scores/bic.cpp" - "pybnesian/learning/scores/bge.cpp" - "pybnesian/learning/scores/bde.cpp" - "pybnesian/learning/scores/cv_likelihood.cpp" - "pybnesian/learning/scores/holdout_likelihood.cpp" - "pybnesian/graph/generic_graph.cpp" - "pybnesian/models/BayesianNetwork.cpp" - "pybnesian/models/GaussianNetwork.cpp" - "pybnesian/models/SemiparametricBN.cpp" - "pybnesian/models/KDENetwork.cpp" - "pybnesian/models/DiscreteBN.cpp" - "pybnesian/models/HomogeneousBN.cpp" - "pybnesian/models/HeterogeneousBN.cpp" - "pybnesian/models/CLGNetwork.cpp" - "pybnesian/models/DynamicBayesianNetwork.cpp" - "pybnesian/opencl/opencl_config.cpp") + "pybnesian/pybindings/pybindings_dataset.cpp" + "pybnesian/pybindings/pybindings_kde.cpp" + "pybnesian/pybindings/pybindings_factors.cpp" + "pybnesian/pybindings/pybindings_graph.cpp" + "pybnesian/pybindings/pybindings_models.cpp" + "pybnesian/pybindings/pybindings_learning/pybindings_learning.cpp" + "pybnesian/pybindings/pybindings_learning/pybindings_scores.cpp" + "pybnesian/pybindings/pybindings_learning/pybindings_independences.cpp" + "pybnesian/pybindings/pybindings_learning/pybindings_parameters.cpp" + "pybnesian/pybindings/pybindings_learning/pybindings_mle.cpp" + "pybnesian/pybindings/pybindings_learning/pybindings_operators.cpp" + "pybnesian/pybindings/pybindings_learning/pybindings_algorithms.cpp" + "pybnesian/kde/KDE.cpp" + "pybnesian/kde/ProductKDE.cpp" + "pybnesian/kde/UCV.cpp" + "pybnesian/factors/continuous/LinearGaussianCPD.cpp" + "pybnesian/factors/continuous/CKDE.cpp" + "pybnesian/factors/discrete/DiscreteFactor.cpp" + "pybnesian/factors/discrete/discrete_indices.cpp" + "pybnesian/dataset/dataset.cpp" + "pybnesian/dataset/dynamic_dataset.cpp" + "pybnesian/dataset/crossvalidation_adaptator.cpp" + "pybnesian/dataset/holdout_adaptator.cpp" + "pybnesian/util/arrow_types.cpp" + "pybnesian/util/bit_util.cpp" + "pybnesian/util/validate_options.cpp" + "pybnesian/util/validate_whitelists.cpp" + "pybnesian/util/temporal.cpp" + "pybnesian/util/rpoly.cpp" + "pybnesian/util/vech_ops.cpp" + "pybnesian/util/pickle.cpp" + "pybnesian/util/util_types.cpp" + "pybnesian/kdtree/kdtree.cpp" + "pybnesian/vptree/vptree.cpp" + "pybnesian/learning/operators/operators.cpp" + "pybnesian/learning/algorithms/hillclimbing.cpp" + "pybnesian/learning/algorithms/pc.cpp" + "pybnesian/learning/algorithms/mmpc.cpp" + "pybnesian/learning/algorithms/mmhc.cpp" + "pybnesian/learning/algorithms/dmmhc.cpp" + "pybnesian/learning/independences/continuous/linearcorrelation.cpp" + "pybnesian/learning/independences/continuous/mutual_information.cpp" + "pybnesian/learning/independences/continuous/RCoT.cpp" + "pybnesian/learning/independences/discrete/chi_square.cpp" + "pybnesian/learning/independences/hybrid/mutual_information.cpp" + "pybnesian/learning/independences/hybrid/mixed_knncmi.cpp" + "pybnesian/learning/parameters/mle_LinearGaussianCPD.cpp" + "pybnesian/learning/parameters/mle_DiscreteFactor.cpp" + "pybnesian/learning/scores/bic.cpp" + "pybnesian/learning/scores/bge.cpp" + "pybnesian/learning/scores/bde.cpp" + "pybnesian/learning/scores/cv_likelihood.cpp" + "pybnesian/learning/scores/holdout_likelihood.cpp" + "pybnesian/graph/generic_graph.cpp" + "pybnesian/models/BayesianNetwork.cpp" + "pybnesian/models/GaussianNetwork.cpp" + 
"pybnesian/models/SemiparametricBN.cpp" + "pybnesian/models/KDENetwork.cpp" + "pybnesian/models/DiscreteBN.cpp" + "pybnesian/models/HomogeneousBN.cpp" + "pybnesian/models/HeterogeneousBN.cpp" + "pybnesian/models/CLGNetwork.cpp" + "pybnesian/models/DynamicBayesianNetwork.cpp" + "pybnesian/opencl/opencl_config.cpp") target_include_directories(__init__ PRIVATE "pybnesian") @@ -187,4 +208,3 @@ target_include_directories(__init__ SYSTEM PRIVATE "lib/eigen-3.3.7" "lib/indica target_link_libraries(__init__ PRIVATE Arrow::arrow_static OpenCL::OpenCL NLopt::nlopt libfort::fort Boost::dynamic_bitset Boost::math) install(TARGETS __init__ LIBRARY DESTINATION ./pybnesian) - diff --git a/INSTALLATION.md b/INSTALLATION.md new file mode 100644 index 00000000..f86cdf54 --- /dev/null +++ b/INSTALLATION.md @@ -0,0 +1,108 @@ +# Installing PyBNesian +Here you can find a detailed installation guide to use PyBNesian including the installation of C++ and GPU tools. + +We acknowledge all the members from Computational Intelligence Group (UPM) for +further discussions related to the installation procedure. + +### Contents +1. [Ubuntu and Linux sub-systems](#ubuntu-and-linux-sub-systems) +2. [Windows](#windows) +3. [Installation issues](#installation-issues) + +## Ubuntu and Linux sub-systems +PyBNesian uses C++ and OpenCL in the backend to speed up certain computations. +Thus, some software is required to ensure everything works. +Note that, although setting up a Conda environment is usually recommended, it is not mandatory. +The following commands ensure that the C++ and OpenCL requirements are satisfied. + +```bash +sudo apt update +sudo apt install cmake +sudo apt install g++ +sudo apt install opencl-headers +sudo apt install ocl-icd-opencl-dev +``` + +After the previous steps you should be able to install PyBNesian and its dependencies. + +### Installing from source +To install from source, we will download git to be able to download the +repository from GitHub. +```bash +sudo apt install git +``` + +Now, clone the repository, install its dependencies, and install the package. + +```bash +git clone https://github.com/carloslihu/PyBNesian.git +cd PyBNesian +pip install . +``` + +### Installing directly from PyPi +Before installing PyBNesian, ensure that all the dependencies are already installed in your Python environment. + +```bash +pip install PyBNesian +``` + +If no errors were raised, then the software is ready to be used. Otherwise, please +restart the process or raise an issue in the repository. + +## Windows +Sometimes, in order to reduce possible inconvenient regarding Windows OS, +a Linux sub-system is installed (https://learn.microsoft.com/es-es/windows/wsl/install). +If this was the case, please go to [Ubuntu and Linux sub-systems](#ubuntu-and-linux-sub-systems) section. +Otherwise, please follow the next steps. + +1. Download Visual Studio 2022 from https://visualstudio.microsoft.com/es/vs/ + + 1.1. Download the requirements for C++ +3. Download Visual Studio Build Tools 2022. + +```bash +winget install "Visual Studio Build Tools 2022" +``` + +3. Download developer tools for GPU. + + 3.1. For Nvidia, download Nvidia Toolkit (https://developer.nvidia.com/cuda-downloads) + + 3.2. For Intel, download OneApi (https://www.intel.com/content/www/us/en/developer/tools/oneapi/toolkits.html) + +5. Download OpenCL for windows. This guide explains the installation process: https://windowsreport.com/opencl-install-windows-11/ + +6. 
Install PyBNesian
+
+### Installing from source
+To install from source, first install Git for Windows (https://git-scm.com/download/win), for example with winget:
+
+```bash
+winget install Git.Git
+```
+
+Now, clone the repository, install its dependencies, and install the package.
+
+```bash
+git clone https://github.com/carloslihu/PyBNesian.git
+cd PyBNesian
+pip install .
+```
+
+### Installing directly from PyPI
+Before installing PyBNesian, ensure that all the dependencies are already installed in your Python environment.
+
+```bash
+pip install PyBNesian
+```
+
+If no errors were raised, the software is ready to be used.
+Otherwise, please restart the process or open an issue in the repository.
+
+## Installation issues
+
+1. If the default [Ubuntu and Linux sub-systems](#ubuntu-and-linux-sub-systems) installation
+fails, it might be necessary to install the GPU toolkits for Linux.
+Please visit https://developer.nvidia.com/cuda-downloads for Nvidia, and
+https://www.intel.com/content/www/us/en/developer/tools/oneapi/toolkits.html for Intel.
\ No newline at end of file
diff --git a/README.md b/README.md
index eba05f6e..2154b6df 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-![build](https://img.shields.io/github/actions/workflow/status/davenza/pybnesian/release.yml)
+![build](https://img.shields.io/github/actions/workflow/status/carloslihu/pybnesian/release.yml)
 [![Documentation Status](https://readthedocs.org/projects/pybnesian/badge/?version=latest)](https://pybnesian.readthedocs.io/en/latest/?badge=latest)
 ![PyPI](https://img.shields.io/pypi/v/pybnesian?color=blue)
 
@@ -296,6 +296,7 @@ Prerequisites
 
 - Git.
 - OpenCL drivers installed.
+We provide a detailed [installation guide](INSTALLATION.md) covering these PyBNesian prerequisites.
 
 Building
 --------
@@ -303,9 +304,9 @@ Building
 Clone the repository:
 
 ```
-git clone https://github.com/davenza/PyBNesian.git
+git clone https://github.com/carloslihu/PyBNesian.git
 cd PyBNesian
-git checkout v0.5.1 # You can checkout a specific version if you want
+git checkout feature/diagonal-bandwidth # Optional: checkout branch with diagonal bandwidth KDE
 pip install .
 ```
diff --git a/conv_template.py b/conv_template.py
index 9f13d9b3..75e36029 100644
--- a/conv_template.py
+++ b/conv_template.py
@@ -4,6 +4,9 @@
 # This code is extracted from numpy distutils.
 #
 # This code has been extracted to avoid loading numpy.distutils
+#
+# It is used to generate the C source files from a template.
+# PyBNesian uses this code to generate the OpenCL kernels for both float and double variables, without depending on numpy.distutils.
 ###############################################################
 
 """
@@ -62,18 +65,17 @@
 3, 3, jim
 """
 
-__all__ = ['process_str', 'process_file']
+__all__ = ["process_str", "process_file"]
 
 import os
-import sys
 import re
+import sys
 
 # names for replacement that are already global.
 global_names = {}
 
 # header placed at the front of head processed file
-header =\
-"""
+header = """
 /*
 *****************************************************************************
 **       This file was autogenerated from a template  DO NOT EDIT!!!!      **
 *****************************************************************************
 */
 """
+
+
 # Parse string for repeat loops
 def parse_structure(astr, level):
     """
     The returned line number is from the beginning of the string, starting
     at zero. Returns an empty list if no loops found.
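 
     For example (an illustrative sketch of the syntax documented in the module
     header; the variable name "dtype" is only an example), at level 0 this
     function locates spans delimited as follows:
 
         /**begin repeat
          * #dtype = float, double#
          */
         ...text repeated for each value, with @dtype@ substituted...
         /**end repeat**/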
""" - if level == 0 : + if level == 0: loopbeg = "/**begin repeat" loopend = "/**end repeat**/" - else : + else: loopbeg = "/**begin repeat%d" % level loopend = "/**end repeat%d**/" % level @@ -105,9 +109,9 @@ def parse_structure(astr, level): start2 = astr.find("\n", start2) fini1 = astr.find(loopend, start2) fini2 = astr.find("\n", fini1) - line += astr.count("\n", ind, start2+1) - spanlist.append((start, start2+1, fini1, fini2+1, line)) - line += astr.count("\n", start2+1, fini2) + line += astr.count("\n", ind, start2 + 1) + spanlist.append((start, start2 + 1, fini1, fini2 + 1, line)) + line += astr.count("\n", start2 + 1, fini2) ind = fini2 spanlist.sort() return spanlist @@ -116,10 +120,13 @@ def parse_structure(astr, level): def paren_repl(obj): torep = obj.group(1) numrep = obj.group(2) - return ','.join([torep]*int(numrep)) + return ",".join([torep] * int(numrep)) + parenrep = re.compile(r"\(([^)]*)\)\*(\d+)") plainrep = re.compile(r"([^*]+)\*(\d+)") + + def parse_values(astr): # replaces all occurrences of '(a,b,c)*4' in astr # with 'a,b,c,a,b,c,a,b,c,a,b,c'. Empty braces generate @@ -127,16 +134,17 @@ def parse_values(astr): # split at ',' and a list of values returned. astr = parenrep.sub(paren_repl, astr) # replaces occurrences of xxx*3 with xxx, xxx, xxx - astr = ','.join([plainrep.sub(paren_repl, x.strip()) - for x in astr.split(',')]) - return astr.split(',') + astr = ",".join([plainrep.sub(paren_repl, x.strip()) for x in astr.split(",")]) + return astr.split(",") stripast = re.compile(r"\n\s*\*?") named_re = re.compile(r"#\s*(\w*)\s*=([^#]*)#") exclude_vars_re = re.compile(r"(\w*)=(\w*)") exclude_re = re.compile(":exclude:") -def parse_loop_header(loophead) : + + +def parse_loop_header(loophead): """Find all named replacements in the header Returns a list of dictionaries, one for each loop iteration, where each key is a name to be substituted and the corresponding @@ -157,86 +165,91 @@ def parse_loop_header(loophead) : name = rep[0] vals = parse_values(rep[1]) size = len(vals) - if nsub is None : + if nsub is None: nsub = size - elif nsub != size : + elif nsub != size: msg = "Mismatch in number of values, %d != %d\n%s = %s" raise ValueError(msg % (nsub, size, name, vals)) names.append((name, vals)) - # Find any exclude variables excludes = [] for obj in exclude_re.finditer(loophead): span = obj.span() # find next newline - endline = loophead.find('\n', span[1]) - substr = loophead[span[1]:endline] + endline = loophead.find("\n", span[1]) + substr = loophead[span[1] : endline] ex_names = exclude_vars_re.findall(substr) excludes.append(dict(ex_names)) # generate list of dictionaries, one for each template iteration dlist = [] - if nsub is None : + if nsub is None: raise ValueError("No substitution variables found") for i in range(nsub): tmp = {name: vals[i] for name, vals in names} dlist.append(tmp) return dlist + replace_re = re.compile(r"@(\w+)@") -def parse_string(astr, env, level, line) : + + +def parse_string(astr, env, level, line): lineno = "#line %d\n" % line # local function for string replacement, uses env def replace(match): name = match.group(1) - try : + try: val = env[name] except KeyError: - msg = 'line %d: no definition of key "%s"'%(line, name) + msg = 'line %d: no definition of key "%s"' % (line, name) raise ValueError(msg) from None return val code = [lineno] struct = parse_structure(astr, level) - if struct : + if struct: # recurse over inner loops oldend = 0 newlevel = level + 1 for sub in struct: - pref = astr[oldend:sub[0]] - head = astr[sub[0]:sub[1]] 
- text = astr[sub[1]:sub[2]] + pref = astr[oldend : sub[0]] + head = astr[sub[0] : sub[1]] + text = astr[sub[1] : sub[2]] oldend = sub[3] newline = line + sub[4] code.append(replace_re.sub(replace, pref)) - try : + try: envlist = parse_loop_header(head) except ValueError as e: msg = "line %d: %s" % (newline, e) raise ValueError(msg) - for newenv in envlist : + for newenv in envlist: newenv.update(env) newcode = parse_string(text, newenv, newlevel, newline) code.extend(newcode) suff = astr[oldend:] code.append(replace_re.sub(replace, suff)) - else : + else: # replace keys code.append(replace_re.sub(replace, astr)) - code.append('\n') - return ''.join(code) + code.append("\n") + return "".join(code) + def process_str(astr): code = [header] code.extend(parse_string(astr, global_names, 0, 1)) - return ''.join(code) + return "".join(code) + +include_src_re = re.compile( + r"(\n|\A)#include\s*['\"]" r"(?P[\w\d./\\]+[.]src)['\"]", re.I +) -include_src_re = re.compile(r"(\n|\A)#include\s*['\"]" - r"(?P[\w\d./\\]+[.]src)['\"]", re.I) def resolve_includes(source): d = os.path.dirname(source) @@ -245,11 +258,11 @@ def resolve_includes(source): for line in fid: m = include_src_re.match(line) if m: - fn = m.group('name') + fn = m.group("name") if not os.path.isabs(fn): fn = os.path.join(d, fn) if os.path.isfile(fn): - print('Including file', fn) + print("Including file", fn) lines.extend(resolve_includes(fn)) else: lines.append(line) @@ -257,17 +270,18 @@ def resolve_includes(source): lines.append(line) return lines + def process_file(source): lines = resolve_includes(source) sourcefile = os.path.normcase(source).replace("\\", "\\\\") try: - code = process_str(''.join(lines)) + code = process_str("".join(lines)) except ValueError as e: raise ValueError('In "%s" loop at %s' % (sourcefile, e)) from None return '#line 1 "%s"\n%s' % (sourcefile, code) -def unique_key(adict): +def unique_key(adict: dict) -> str: # this obtains a unique key given a dictionary # currently it works by appending together n of the letters of the # current keys and increasing n until a unique key is found @@ -275,6 +289,7 @@ def unique_key(adict): allkeys = list(adict.keys()) done = False n = 1 + newkey = "" while not done: newkey = "".join([x[:n] for x in allkeys]) if newkey in allkeys: @@ -285,16 +300,17 @@ def unique_key(adict): def main(): + file = None try: file = sys.argv[1] except IndexError: fid = sys.stdin outfile = sys.stdout else: - fid = open(file, 'r') + fid = open(file, "r") (base, ext) = os.path.splitext(file) newname = base - outfile = open(newname, 'w') + outfile = open(newname, "w") allstr = fid.read() try: @@ -304,5 +320,6 @@ def main(): outfile.write(writestr) + if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/docs/source/api/learning/independences.rst b/docs/source/api/learning/independences.rst index dee80b18..1a52211f 100644 --- a/docs/source/api/learning/independences.rst +++ b/docs/source/api/learning/independences.rst @@ -76,4 +76,11 @@ Bibliography 938–947. .. [RCoT] Strobl, E. V., Zhang, K., & Visweswaran, S. (2019). Approximate kernel-based conditional independence tests - for fast non-parametric causal discovery. Journal of Causal Inference, 7(1). \ No newline at end of file + for fast non-parametric causal discovery. Journal of Causal Inference, 7(1). + +.. [MSCMI] [1] Mesner, O. C. and Shalizi C. R. (2021) Conditional mutual information estimation for mixed, discrete and + continuous data. IEEE Transactions on Information Theory, 67(1), 464–484. + +.. 
[MixedCMIKnn] Popescu, O.-I., Gerhardus, A. & Runge, J. (2023). Non-parametric conditional independence testing for
+   mixed continuous-categorical variables: A novel method and numerical evaluation. arXiv pre-print.
+   Available: https://arxiv.org/abs/2310.11132
\ No newline at end of file
diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst
index 38042515..9e3e0b29 100644
--- a/docs/source/changelog.rst
+++ b/docs/source/changelog.rst
@@ -2,6 +2,15 @@
 Changelog
 *********
 
+v0.5.2
+======
+
+- Python code is now formatted with ``black`` and ``isort``, and has been refactored according to the ``PEP 8`` style guide.
+- Python code is now partially commented using the ``google`` docstring format.
+- C++ code is now partially commented using the ``doxygen`` docstring format.
+- The Scott's rule and Normal Reference Rule ``bandwidth`` calculations have been reordered and commented.
+- ``ArcOperatorSet::update_incoming_arcs_scores`` formulas have been reordered and commented.
+
 v0.5.1
 ======
 
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 39539f5e..418e188b 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -17,13 +17,13 @@
 
 # -- Project information -----------------------------------------------------
 
-project = 'PyBNesian'
-copyright = '2024, David Atienza'
-author = 'David Atienza'
+project = "PyBNesian"
+copyright = "2024, David Atienza"
+author = "David Atienza, Carlos Li Hu"
 
 # The full version, including alpha/beta/rc tags
-version = '0.5.1'
-release = '0.5.1'
+version = "0.5.2"
+release = "0.5.2"
 
 
 # -- General configuration ---------------------------------------------------
@@ -32,20 +32,21 @@
 # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
 # ones.
 extensions = [
-    'sphinx.ext.autodoc',
-    'sphinx.ext.intersphinx',
-    'sphinx.ext.autosummary',
-    'sphinx.ext.napoleon',
-    'sphinx.ext.doctest',
-    'sphinx.ext.mathjax',
-    'sphinx_rtd_theme']
+    "sphinx.ext.autodoc",
+    "sphinx.ext.intersphinx",
+    "sphinx.ext.autosummary",
+    "sphinx.ext.napoleon",
+    "sphinx.ext.doctest",
+    "sphinx.ext.mathjax",
+    "sphinx_rtd_theme",
+]
 
 autosummary_generate = True
 
 # Add any paths that contain templates here, relative to this directory.
-templates_path = ['_templates']
+templates_path = ["_templates"]
 
-source_suffix = '.rst'
+source_suffix = ".rst"
 
 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.
@@ -53,13 +54,13 @@
 exclude_patterns = []
 
 # Removes the module prefix of the class definition
-#add_module_names = False
+# add_module_names = False
 
 intersphinx_mapping = {
-    'pyarrow': ('https://arrow.apache.org/docs/', None),
-    'pandas': ('https://pandas.pydata.org/pandas-docs/stable/', None),
-    'numpy': ('https://numpy.org/doc/stable/', None),
-    'pickle': ('https://docs.python.org/3/', None)
+    "pyarrow": ("https://arrow.apache.org/docs/", None),
+    "pandas": ("https://pandas.pydata.org/pandas-docs/stable/", None),
+    "numpy": ("https://numpy.org/doc/stable/", None),
+    "pickle": ("https://docs.python.org/3/", None),
 }
 
 # -- Options for HTML output -------------------------------------------------
@@ -67,9 +68,9 @@
 # The theme to use for HTML and HTML Help pages.  See the documentation for
 # a list of builtin themes.
 #
-html_theme = 'sphinx_rtd_theme'
+html_theme = "sphinx_rtd_theme"
 
 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,
 # so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ['_static'] \ No newline at end of file +html_static_path = ["_static"] diff --git a/expand_sources.py b/expand_sources.py index 1850d01f..455db27f 100644 --- a/expand_sources.py +++ b/expand_sources.py @@ -1,35 +1,37 @@ import os + import conv_template def expand_sources(): - sources = ['pybnesian/kde/opencl_kernels/KDE.cl.src'] - + sources = ["pybnesian/kde/opencl_kernels/KDE.cl.src"] + for source in sources: (base, _) = os.path.splitext(source) outstr = conv_template.process_file(source) - with open(base, 'w') as fid: + with open(base, "w") as fid: fid.write(outstr) def copy_opencl_code(): - sources = ['pybnesian/kde/opencl_kernels/KDE.cl'] + sources = ["pybnesian/kde/opencl_kernels/KDE.cl"] # Split the CPP code because the MSVC only allow strings of a max size. # Error C2026: https://docs.microsoft.com/en-us/cpp/error-messages/compiler-errors-1/compiler-error-c2026?view=msvc-160 - MAX_LENGTH=16378 + MAX_LENGTH = 16378 code_str = "" for source in sources: - code_str += '\n' + code_str += "\n" with open(source) as f: source_code = f.read() code_str += source_code - fragments = [code_str[i:(i + MAX_LENGTH)] for i in range(0, len(code_str), MAX_LENGTH)] + fragments = [ + code_str[i : (i + MAX_LENGTH)] for i in range(0, len(code_str), MAX_LENGTH) + ] - cpp_code = \ - """#ifndef PYBNESIAN_OPENCL_OPENCL_CODE_HPP + cpp_code = """#ifndef PYBNESIAN_OPENCL_OPENCL_CODE_HPP #define PYBNESIAN_OPENCL_OPENCL_CODE_HPP namespace opencl { @@ -42,10 +44,10 @@ def copy_opencl_code(): } #endif //PYBNESIAN_OPENCL_OPENCL_CODE_HPP""" - with open('pybnesian/opencl/opencl_code.hpp', 'w') as f: + with open("pybnesian/opencl/opencl_code.hpp", "w") as f: f.write(cpp_code) if __name__ == "__main__": expand_sources() - copy_opencl_code() \ No newline at end of file + copy_opencl_code() diff --git a/lib/eigen-3.3.7/debug/gdb/printers.py b/lib/eigen-3.3.7/debug/gdb/printers.py index 0d67a5f9..4869e948 100644 --- a/lib/eigen-3.3.7/debug/gdb/printers.py +++ b/lib/eigen-3.3.7/debug/gdb/printers.py @@ -9,14 +9,14 @@ # file, You can obtain one at http://mozilla.org/MPL/2.0/. # Pretty printers for Eigen::Matrix -# This is still pretty basic as the python extension to gdb is still pretty basic. +# This is still pretty basic as the python extension to gdb is still pretty basic. # It cannot handle complex eigen types and it doesn't support any of the other eigen types -# Such as quaternion or some other type. +# Such as quaternion or some other type. # This code supports fixed size as well as dynamic size matrices # To use it: # -# * Create a directory and put the file as well as an empty __init__.py in +# * Create a directory and put the file as well as an empty __init__.py in # that directory. 
# * Create a ~/.gdbinit file, that contains the following: # python @@ -26,189 +26,214 @@ # register_eigen_printers (None) # end -import gdb import re -import itertools + +import gdb # type: ignore class EigenMatrixPrinter: - "Print Eigen Matrix or Array of some kind" - - def __init__(self, variety, val): - "Extract all the necessary information" - - # Save the variety (presumably "Matrix" or "Array") for later usage - self.variety = variety - - # The gdb extension does not support value template arguments - need to extract them by hand - type = val.type - if type.code == gdb.TYPE_CODE_REF: - type = type.target() - self.type = type.unqualified().strip_typedefs() - tag = self.type.tag - regex = re.compile('\<.*\>') - m = regex.findall(tag)[0][1:-1] - template_params = m.split(',') - template_params = [x.replace(" ", "") for x in template_params] - - if template_params[1] == '-0x00000000000000001' or template_params[1] == '-0x000000001' or template_params[1] == '-1': - self.rows = val['m_storage']['m_rows'] - else: - self.rows = int(template_params[1]) - - if template_params[2] == '-0x00000000000000001' or template_params[2] == '-0x000000001' or template_params[2] == '-1': - self.cols = val['m_storage']['m_cols'] - else: - self.cols = int(template_params[2]) - - self.options = 0 # default value - if len(template_params) > 3: - self.options = template_params[3]; - - self.rowMajor = (int(self.options) & 0x1) - - self.innerType = self.type.template_argument(0) - - self.val = val - - # Fixed size matrices have a struct as their storage, so we need to walk through this - self.data = self.val['m_storage']['m_data'] - if self.data.type.code == gdb.TYPE_CODE_STRUCT: - self.data = self.data['array'] - self.data = self.data.cast(self.innerType.pointer()) - - class _iterator: - def __init__ (self, rows, cols, dataPtr, rowMajor): - self.rows = rows - self.cols = cols - self.dataPtr = dataPtr - self.currentRow = 0 - self.currentCol = 0 - self.rowMajor = rowMajor - - def __iter__ (self): - return self - - def next(self): - return self.__next__() # Python 2.x compatibility - - def __next__(self): - - row = self.currentRow - col = self.currentCol - if self.rowMajor == 0: - if self.currentCol >= self.cols: - raise StopIteration - - self.currentRow = self.currentRow + 1 - if self.currentRow >= self.rows: - self.currentRow = 0 - self.currentCol = self.currentCol + 1 - else: - if self.currentRow >= self.rows: - raise StopIteration - - self.currentCol = self.currentCol + 1 - if self.currentCol >= self.cols: - self.currentCol = 0 - self.currentRow = self.currentRow + 1 - - - item = self.dataPtr.dereference() - self.dataPtr = self.dataPtr + 1 - if (self.cols == 1): #if it's a column vector - return ('[%d]' % (row,), item) - elif (self.rows == 1): #if it's a row vector - return ('[%d]' % (col,), item) - return ('[%d,%d]' % (row, col), item) - - def children(self): - - return self._iterator(self.rows, self.cols, self.data, self.rowMajor) - - def to_string(self): - return "Eigen::%s<%s,%d,%d,%s> (data ptr: %s)" % (self.variety, self.innerType, self.rows, self.cols, "RowMajor" if self.rowMajor else "ColMajor", self.data) + "Print Eigen Matrix or Array of some kind" + + def __init__(self, variety, val): + "Extract all the necessary information" + + # Save the variety (presumably "Matrix" or "Array") for later usage + self.variety = variety + + # The gdb extension does not support value template arguments - need to extract them by hand + type = val.type + if type.code == gdb.TYPE_CODE_REF: + type = type.target() + 
self.type = type.unqualified().strip_typedefs() + tag = self.type.tag + regex = re.compile(r"\<.*\>") + m = regex.findall(tag)[0][1:-1] + template_params = m.split(",") + template_params = [x.replace(" ", "") for x in template_params] + + if ( + template_params[1] == "-0x00000000000000001" + or template_params[1] == "-0x000000001" + or template_params[1] == "-1" + ): + self.rows = val["m_storage"]["m_rows"] + else: + self.rows = int(template_params[1]) + + if ( + template_params[2] == "-0x00000000000000001" + or template_params[2] == "-0x000000001" + or template_params[2] == "-1" + ): + self.cols = val["m_storage"]["m_cols"] + else: + self.cols = int(template_params[2]) + + self.options = 0 # default value + if len(template_params) > 3: + self.options = template_params[3] + + self.rowMajor = int(self.options) & 0x1 + + self.innerType = self.type.template_argument(0) + + self.val = val + + # Fixed size matrices have a struct as their storage, so we need to walk through this + self.data = self.val["m_storage"]["m_data"] + if self.data.type.code == gdb.TYPE_CODE_STRUCT: + self.data = self.data["array"] + self.data = self.data.cast(self.innerType.pointer()) + + class _iterator: + def __init__(self, rows, cols, dataPtr, rowMajor): + self.rows = rows + self.cols = cols + self.dataPtr = dataPtr + self.currentRow = 0 + self.currentCol = 0 + self.rowMajor = rowMajor + + def __iter__(self): + return self + + def next(self): + return self.__next__() # Python 2.x compatibility + + def __next__(self): + + row = self.currentRow + col = self.currentCol + if self.rowMajor == 0: + if self.currentCol >= self.cols: + raise StopIteration + + self.currentRow = self.currentRow + 1 + if self.currentRow >= self.rows: + self.currentRow = 0 + self.currentCol = self.currentCol + 1 + else: + if self.currentRow >= self.rows: + raise StopIteration + + self.currentCol = self.currentCol + 1 + if self.currentCol >= self.cols: + self.currentCol = 0 + self.currentRow = self.currentRow + 1 + + item = self.dataPtr.dereference() + self.dataPtr = self.dataPtr + 1 + if self.cols == 1: # if it's a column vector + return ("[%d]" % (row,), item) + elif self.rows == 1: # if it's a row vector + return ("[%d]" % (col,), item) + return ("[%d,%d]" % (row, col), item) + + def children(self): + + return self._iterator(self.rows, self.cols, self.data, self.rowMajor) + + def to_string(self): + return "Eigen::%s<%s,%d,%d,%s> (data ptr: %s)" % ( + self.variety, + self.innerType, + self.rows, + self.cols, + "RowMajor" if self.rowMajor else "ColMajor", + self.data, + ) + class EigenQuaternionPrinter: - "Print an Eigen Quaternion" - - def __init__(self, val): - "Extract all the necessary information" - # The gdb extension does not support value template arguments - need to extract them by hand - type = val.type - if type.code == gdb.TYPE_CODE_REF: - type = type.target() - self.type = type.unqualified().strip_typedefs() - self.innerType = self.type.template_argument(0) - self.val = val - - # Quaternions have a struct as their storage, so we need to walk through this - self.data = self.val['m_coeffs']['m_storage']['m_data']['array'] - self.data = self.data.cast(self.innerType.pointer()) - - class _iterator: - def __init__ (self, dataPtr): - self.dataPtr = dataPtr - self.currentElement = 0 - self.elementNames = ['x', 'y', 'z', 'w'] - - def __iter__ (self): - return self - - def next(self): - return self.__next__() # Python 2.x compatibility - - def __next__(self): - element = self.currentElement - - if self.currentElement >= 4: #there are 4 
elements in a quanternion - raise StopIteration - - self.currentElement = self.currentElement + 1 - - item = self.dataPtr.dereference() - self.dataPtr = self.dataPtr + 1 - return ('[%s]' % (self.elementNames[element],), item) - - def children(self): - - return self._iterator(self.data) - - def to_string(self): - return "Eigen::Quaternion<%s> (data ptr: %s)" % (self.innerType, self.data) - -def build_eigen_dictionary (): - pretty_printers_dict[re.compile('^Eigen::Quaternion<.*>$')] = lambda val: EigenQuaternionPrinter(val) - pretty_printers_dict[re.compile('^Eigen::Matrix<.*>$')] = lambda val: EigenMatrixPrinter("Matrix", val) - pretty_printers_dict[re.compile('^Eigen::Array<.*>$')] = lambda val: EigenMatrixPrinter("Array", val) + "Print an Eigen Quaternion" + + def __init__(self, val): + "Extract all the necessary information" + # The gdb extension does not support value template arguments - need to extract them by hand + type = val.type + if type.code == gdb.TYPE_CODE_REF: + type = type.target() + self.type = type.unqualified().strip_typedefs() + self.innerType = self.type.template_argument(0) + self.val = val + + # Quaternions have a struct as their storage, so we need to walk through this + self.data = self.val["m_coeffs"]["m_storage"]["m_data"]["array"] + self.data = self.data.cast(self.innerType.pointer()) + + class _iterator: + def __init__(self, dataPtr): + self.dataPtr = dataPtr + self.currentElement = 0 + self.elementNames = ["x", "y", "z", "w"] + + def __iter__(self): + return self + + def next(self): + return self.__next__() # Python 2.x compatibility + + def __next__(self): + element = self.currentElement + + if self.currentElement >= 4: # there are 4 elements in a quanternion + raise StopIteration + + self.currentElement = self.currentElement + 1 + + item = self.dataPtr.dereference() + self.dataPtr = self.dataPtr + 1 + return ("[%s]" % (self.elementNames[element],), item) + + def children(self): + + return self._iterator(self.data) + + def to_string(self): + return "Eigen::Quaternion<%s> (data ptr: %s)" % (self.innerType, self.data) + + +def build_eigen_dictionary(): + pretty_printers_dict[re.compile("^Eigen::Quaternion<.*>$")] = ( + lambda val: EigenQuaternionPrinter(val) + ) + pretty_printers_dict[re.compile("^Eigen::Matrix<.*>$")] = ( + lambda val: EigenMatrixPrinter("Matrix", val) + ) + pretty_printers_dict[re.compile("^Eigen::Array<.*>$")] = ( + lambda val: EigenMatrixPrinter("Array", val) + ) + def register_eigen_printers(obj): - "Register eigen pretty-printers with objfile Obj" + "Register eigen pretty-printers with objfile Obj" + + if obj == None: + obj = gdb + obj.pretty_printers.append(lookup_function) - if obj == None: - obj = gdb - obj.pretty_printers.append(lookup_function) def lookup_function(val): - "Look-up and return a pretty-printer that can print va." - - type = val.type - - if type.code == gdb.TYPE_CODE_REF: - type = type.target() - - type = type.unqualified().strip_typedefs() - - typename = type.tag - if typename == None: - return None - - for function in pretty_printers_dict: - if function.search(typename): - return pretty_printers_dict[function](val) - - return None + "Look-up and return a pretty-printer that can print va." 
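+    # Illustrative example (not part of the original sources): a value whose
+    # stripped type prints as "Eigen::Matrix<double, -1, -1, 0, -1, -1>" is
+    # matched by the "^Eigen::Matrix<.*>$" pattern registered in
+    # build_eigen_dictionary(), so the Matrix printer is returned for it.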
+ + type = val.type + + if type.code == gdb.TYPE_CODE_REF: + type = type.target() + + type = type.unqualified().strip_typedefs() + + typename = type.tag + if typename == None: + return None + + for function in pretty_printers_dict: + if function.search(typename): + return pretty_printers_dict[function](val) + + return None + pretty_printers_dict = {} -build_eigen_dictionary () +build_eigen_dictionary() diff --git a/lib/eigen-3.3.7/scripts/relicense.py b/lib/eigen-3.3.7/scripts/relicense.py index 8a5265f1..1179db00 100644 --- a/lib/eigen-3.3.7/scripts/relicense.py +++ b/lib/eigen-3.3.7/scripts/relicense.py @@ -11,7 +11,7 @@ # # Make the long-awaited conversion to MPL. -lgpl3_header = ''' +lgpl3_header = """ // Eigen is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either @@ -30,7 +30,7 @@ // You should have received a copy of the GNU Lesser General Public // License and a copy of the GNU General Public License along with // Eigen. If not, see . -''' +""" mpl2_header = """ // This Source Code Form is subject to the terms of the Mozilla @@ -41,29 +41,29 @@ import os import sys -exclusions = set(['relicense.py']) +exclusions = set(["relicense.py"]) + def update(text): - if text.find(lgpl3_header) == -1: - return text, False - return text.replace(lgpl3_header, mpl2_header), True + if text.find(lgpl3_header) == -1: + return text, False + return text.replace(lgpl3_header, mpl2_header), True + rootdir = sys.argv[1] for root, sub_folders, files in os.walk(rootdir): for basename in files: - if basename in exclusions: - print 'SKIPPED', filename - continue filename = os.path.join(root, basename) - fo = file(filename) - text = fo.read() - fo.close() + if basename in exclusions: + print("SKIPPED", filename) + continue + with open(filename, "r") as fo: + text = fo.read() text, updated = update(text) if updated: - fo = file(filename, "w") - fo.write(text) - fo.close() - print 'UPDATED', filename + with open(filename, "w") as fo: + fo.write(text) + print("UPDATED", filename) else: - print ' ', filename + print(" ", filename) diff --git a/pybnesian/dataset/dataset.cpp b/pybnesian/dataset/dataset.cpp index 6df14d07..aef80da9 100644 --- a/pybnesian/dataset/dataset.cpp +++ b/pybnesian/dataset/dataset.cpp @@ -69,13 +69,13 @@ struct ArrowSchema* extract_pycapsule_schema(py::handle pyobject) { throw pybind11::attribute_error("Method __arrow_c_schema__ not found."); } - #ifdef Python_MAJOR_VERSION == 3 && Python_MINOR_VERSION >= 9 +#if Python_MAJOR_VERSION == 3 && Python_MINOR_VERSION >= 9 PyObject* schema_capsule_obj = PyObject_CallNoArgs(arrow_c_method); - #else +#else PyObject* args = PyTuple_New(0); PyObject* schema_capsule_obj = PyObject_Call(arrow_c_method, args, NULL); Py_DECREF(args); - #endif +#endif Py_DECREF(arrow_c_method); // extract the capsule @@ -94,13 +94,13 @@ struct ArrowCAPIObjects extract_pycapsule_array(py::handle pyobject) { throw pybind11::attribute_error("Method __arrow_c_array__ not found."); } - #ifdef Python_MAJOR_VERSION == 3 && Python_MINOR_VERSION >= 9 +#if Python_MAJOR_VERSION == 3 && Python_MINOR_VERSION >= 9 PyObject* array_capsule_tuple = PyObject_CallNoArgs(arrow_c_method); - #else +#else PyObject* args = PyTuple_New(0); PyObject* array_capsule_tuple = PyObject_Call(arrow_c_method, args, NULL); Py_DECREF(args); - #endif +#endif Py_DECREF(arrow_c_method); @@ -197,6 +197,13 @@ std::vector DataFrame::column_names() const { return names; } +/** + * @brief Returns 
the number of null elements in the array. + * + * @param begin Iterator to the first element of the array. + * @param end Iterator to the last element of the array. + * @return int64_t Number of null elements. + */ int64_t null_count(Array_iterator begin, Array_iterator end) { int64_t r = 0; for (auto it = begin; it != end; it++) { @@ -204,7 +211,13 @@ int64_t null_count(Array_iterator begin, Array_iterator end) { } return r; } - +/** + * @brief Returns the combined bitmap of the columns. + * + * @param begin Iterator to the first element of the array. + * @param end Iterator to the last element of the array. + * @return Buffer_ptr Combined bitmap. + */ Buffer_ptr combined_bitmap(Array_iterator begin, Array_iterator end) { if (null_count(begin, end) > 0) { Array_iterator first_null_col = end; @@ -233,7 +246,13 @@ Buffer_ptr combined_bitmap(Array_iterator begin, Array_iterator end) { return nullptr; } } - +/** + * @brief Returns the number of valid rows in the columns (The dataframe may have non-valid rows?). + * + * @param begin Iterator to the first element of the array. + * @param end Iterator to the last element of the array. + * @return int64_t Number of valid rows. + */ int64_t valid_rows(Array_iterator begin, Array_iterator end) { if (std::distance(begin, end) == 0) { return 0; diff --git a/pybnesian/dataset/dataset.hpp b/pybnesian/dataset/dataset.hpp index 8c3fa813..7db48a45 100644 --- a/pybnesian/dataset/dataset.hpp +++ b/pybnesian/dataset/dataset.hpp @@ -2244,13 +2244,13 @@ struct type_caster> { PyObject* method_py = method.ptr(); - #ifdef Python_MAJOR_VERSION == 3 && Python_MINOR_VERSION >= 9 +#if Python_MAJOR_VERSION == 3 && Python_MINOR_VERSION >= 9 py::handle casted = PyObject_CallOneArg(method_py, schema_capsule); - #else +#else PyObject* args = PyTuple_Pack(1, schema_capsule); py::handle casted = PyObject_Call(method_py, args, NULL); Py_DECREF(args); - #endif +#endif return casted; } diff --git a/pybnesian/dataset/holdout_adaptator.hpp b/pybnesian/dataset/holdout_adaptator.hpp index 9f11a39b..284d6178 100644 --- a/pybnesian/dataset/holdout_adaptator.hpp +++ b/pybnesian/dataset/holdout_adaptator.hpp @@ -49,7 +49,7 @@ class HoldOut { if (test_rows == 0 || train_rows == 0) { throw std::invalid_argument("Wrong test_ratio (" + std::to_string(test_ratio) + - "selected for HoldOut.\n" + ") selected for HoldOut.\n" "Generated train instances: " + std::to_string(train_rows) + "\n" diff --git a/pybnesian/factors/continuous/CKDE.cpp b/pybnesian/factors/continuous/CKDE.cpp index c0286c0e..3eabe5f4 100644 --- a/pybnesian/factors/continuous/CKDE.cpp +++ b/pybnesian/factors/continuous/CKDE.cpp @@ -40,6 +40,11 @@ std::shared_ptr CKDEType::new_factor(const ConditionalBayesianNetworkBas return generic_new_factor(variable, evidence, args, kwargs); } +/** + * @brief Public function to learn the CKDE parameters given the data. + * + * @param df Data. + */ void CKDE::fit(const DataFrame& df) { auto type = df.same_type(m_variables); diff --git a/pybnesian/factors/continuous/CKDE.hpp b/pybnesian/factors/continuous/CKDE.hpp index f179ab7d..053880e0 100644 --- a/pybnesian/factors/continuous/CKDE.hpp +++ b/pybnesian/factors/continuous/CKDE.hpp @@ -179,6 +179,12 @@ class CKDE : public Factor { KDE m_marg; }; +/** + * @brief Private function to learn the CKDE parameters given the data. + * + * @tparam ArrowType Arrow Data type. + * @param df Data. 
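+ * @note A sketch of the decomposition this factor relies on (using the m_joint
+ * and m_marg KDE members declared above): the conditional log-density is
+ * evaluated as log f(x | evidence) = log f(x, evidence) - log f(evidence).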
+ */ template void CKDE::_fit(const DataFrame& df) { m_joint.fit(df); diff --git a/pybnesian/factors/discrete/DiscreteFactor.cpp b/pybnesian/factors/discrete/DiscreteFactor.cpp index 35142b6d..b0584db4 100644 --- a/pybnesian/factors/discrete/DiscreteFactor.cpp +++ b/pybnesian/factors/discrete/DiscreteFactor.cpp @@ -210,6 +210,7 @@ Array_ptr DiscreteFactor::sample(int n, const DataFrame& evidence_values, unsign std::string DiscreteFactor::ToString() const { std::stringstream stream; stream << std::setprecision(3); + // Evidence refers to the parents if (!evidence().empty()) { const auto& e = evidence(); stream << "[DiscreteFactor] P(" << variable() << " | " << e[0]; diff --git a/pybnesian/kde/KDE.cpp b/pybnesian/kde/KDE.cpp index fb6133fe..bb35a9fe 100644 --- a/pybnesian/kde/KDE.cpp +++ b/pybnesian/kde/KDE.cpp @@ -39,7 +39,11 @@ DataFrame KDE::training_data() const { throw std::invalid_argument("Unreachable code."); } } - +/** + * @brief Learns the KDE parameters from the given data. + * + * @param df Data. + */ void KDE::fit(const DataFrame& df) { m_training_type = df.same_type(m_variables); @@ -67,6 +71,12 @@ void KDE::fit(const DataFrame& df) { m_fitted = true; } +/** + * @brief Public function to calculate the log-likelihood vector of the given data. + * + * @param df Data. + * @return VectorXd Log-likelihood vector. + */ VectorXd KDE::logl(const DataFrame& df) const { check_fitted(); auto type = df.same_type(m_variables); @@ -85,6 +95,12 @@ VectorXd KDE::logl(const DataFrame& df) const { } } +/** + * @brief Public function to calculate the log-likelihood sum of the given data. + * + * @param df Data. + * @return double Log-likelihood sum. + */ double KDE::slogl(const DataFrame& df) const { check_fitted(); auto type = df.same_type(m_variables); diff --git a/pybnesian/kde/KDE.hpp b/pybnesian/kde/KDE.hpp index 049078f7..249847ef 100644 --- a/pybnesian/kde/KDE.hpp +++ b/pybnesian/kde/KDE.hpp @@ -13,7 +13,27 @@ using opencl::OpenCLConfig, opencl::OpenCL_kernel_traits; namespace kde { +/** + * @brief Class for calculating the Univariate Kernel Density Estimation. + + * + */ struct UnivariateKDE { + /** + * @brief Executes the log-likelihood calculation for a univariate KDE model. + * + * @tparam ArrowType Arrow data type. + * @param training_vec Training data. + * @param training_length Number of training instances. + * @param test_vec Test data. + * @param int Unused. + * @param test_offset ? + * @param test_length Number of test instances. + * @param int Unused. + * @param cholesky Cholesky decomposition of the bandwidth matrix. + * @param lognorm_const log-likelihood constant. + * @param output_mat Output matrix. + */ template void static execute_logl_mat(const cl::Buffer& training_vec, const unsigned int training_length, @@ -26,6 +46,7 @@ struct UnivariateKDE { const typename ArrowType::c_type lognorm_const, cl::Buffer&, cl::Buffer& output_mat); + template static void execute_conditional_means(const cl::Buffer& joint_training, const cl::Buffer&, @@ -40,6 +61,21 @@ struct UnivariateKDE { cl::Buffer& output_mat); }; +/** + * @brief Executes the log-likelihood calculation for a univariate KDE model for each variable. + * + * @tparam ArrowType Arrow data type. + * @param training_vec Training data. + * @param training_length Number of training instances. + * @param test_vec Test data. + * @param int Unused. + * @param test_offset + * @param test_length Number of test instances. + * @param int Unused. + * @param cholesky Cholesky decomposition of the bandwidth matrix. 
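+ * @note Assuming the 1D kernel shown in the comment below, each entry is
+ * computed as output(i, j) = -0.5 * ((train_i - test_j) / h)^2 + lognorm_const,
+ * where h is the standard deviation (the 1x1 Cholesky factor of the bandwidth
+ * matrix) passed through the cholesky buffer.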
+ * @param lognorm_const log-likelihood constant. + * @param output_mat Output matrix. + */ template void UnivariateKDE::execute_logl_mat(const cl::Buffer& training_vec, const unsigned int training_length, @@ -53,6 +89,22 @@ void UnivariateKDE::execute_logl_mat(const cl::Buffer& training_vec, cl::Buffer&, cl::Buffer& output_mat) { auto& opencl = OpenCLConfig::get(); + // TODO: This is the kernel that is executed, might be wrong? + // OpenCL kernel for calculating the log-likelihood values for each test instance + // __kernel void logl_values_1d_mat_double(__global double *restrict train_vector, + // __private uint train_rows, + // __global double *restrict test_vector, + // __private uint test_offset, + // __constant double *standard_deviation, + // __private double lognorm_factor, + // __global double *restrict result) { + // int i = get_global_id(0); + // int train_idx = ROW(i, train_rows); + // int test_idx = COL(i, train_rows); + // double d = (train_vector[train_idx] - test_vector[test_offset + test_idx]) / standard_deviation[0]; + + // result[i] = (-0.5*d*d) + lognorm_factor; + // } auto& k_logl_values_1d_mat = opencl.kernel(OpenCL_kernel_traits::logl_values_1d_mat); k_logl_values_1d_mat.setArg(0, training_vec); k_logl_values_1d_mat.setArg(1, training_length); @@ -61,11 +113,14 @@ void UnivariateKDE::execute_logl_mat(const cl::Buffer& training_vec, k_logl_values_1d_mat.setArg(4, cholesky); k_logl_values_1d_mat.setArg(5, lognorm_const); k_logl_values_1d_mat.setArg(6, output_mat); + auto& queue = opencl.queue(); + // ? Calculates the log-likelihood values for each test instance RAISE_ENQUEUEKERNEL_ERROR(queue.enqueueNDRangeKernel( k_logl_values_1d_mat, cl::NullRange, cl::NDRange(training_length * test_length), cl::NullRange)); } +// Computes conditional mu. template void UnivariateKDE::execute_conditional_means(const cl::Buffer& joint_training, const cl::Buffer&, @@ -88,6 +143,8 @@ void UnivariateKDE::execute_conditional_means(const cl::Buffer& joint_training, k_conditional_means_1d.setArg(5, transform_mean); k_conditional_means_1d.setArg(6, output_mat); auto& queue = opencl.queue(); + + // ? Calculates the log-likelihood values for each test instance RAISE_ENQUEUEKERNEL_ERROR(queue.enqueueNDRangeKernel( k_conditional_means_1d, cl::NullRange, cl::NDRange(training_rows * test_length), cl::NullRange)); } @@ -119,7 +176,22 @@ struct MultivariateKDE { cl::Buffer& tmp_mat, cl::Buffer& output_mat); }; - +/** + * @brief Executes the log-likelihood calculation for a multivariate KDE model for each variable. + * + * @tparam ArrowType Arrow data type. + * @param training_mat Training data. + * @param training_rows Number of training instances. + * @param test_mat Test data. + * @param test_physical_rows Number of test instances. + * @param test_offset ? + * @param test_length Number of test instances. + * @param matrices_cols Number of columns of the matrices. + * @param cholesky Cholesky decomposition of the bandwidth matrix. + * @param lognorm_const log-likelihood constant. + * @param tmp_mat Temporary matrix. + * @param output_mat Output matrix. 
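+ * @note A sketch of the computation, following the substract/solve/square
+ * kernels pasted below: z = L^{-1} * (train_i - test_j) is obtained by forward
+ * substitution with the Cholesky factor L of the bandwidth matrix, and then
+ * output(i, j) = -0.5 * ||z||^2 + lognorm_const.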
+ */ template void MultivariateKDE::execute_logl_mat(const cl::Buffer& training_mat, const unsigned int training_rows, @@ -134,19 +206,55 @@ void MultivariateKDE::execute_logl_mat(const cl::Buffer& training_mat, cl::Buffer& output_mat) { auto& opencl = OpenCLConfig::get(); + // __kernel void substract_double(__global double* restrict training_matrix, + // __private uint training_physical_rows, + // __private uint training_offset, + // __private uint training_rows, + // __global double* restrict test_matrix, + // __private uint test_physical_rows, + // __private uint test_offset, + // __private uint test_row_idx, + // __global double* restrict res) { + // uint i = get_global_id(0); + // uint r = ROW(i, training_rows) + training_offset; + // uint c = COL(i, training_rows); + // res[i] = test_matrix[IDX(test_offset + test_row_idx, c, test_physical_rows)] - + // training_matrix[IDX(r, c, training_physical_rows)]; + // } auto& k_substract = opencl.kernel(OpenCL_kernel_traits::substract); + // __kernel void solve_double(__global double* restrict diff_matrix, + // __private uint diff_matrix_rows, + // __private uint matrices_cols, + // __global double* restrict cholesky_matrix) { + // uint r = get_global_id(0); + + // for (uint c = 0; c < matrices_cols; c++) { + // for (uint i = 0; i < c; i++) { + // diff_matrix[IDX(r, c, diff_matrix_rows)] -= + // cholesky_matrix[IDX(c, i, matrices_cols)] * diff_matrix[IDX(r, i, diff_matrix_rows)]; + // } + // diff_matrix[IDX(r, c, diff_matrix_rows)] /= cholesky_matrix[IDX(c, c, matrices_cols)]; + // } + // } auto& k_solve = opencl.kernel(OpenCL_kernel_traits::solve); k_solve.setArg(0, tmp_mat); k_solve.setArg(2, matrices_cols); k_solve.setArg(3, cholesky); + // __kernel void square_double(__global double* restrict m) { + // uint idx = get_global_id(0); + // double d = m[idx]; + // m[idx] = d * d; + // } auto& k_square = opencl.kernel(OpenCL_kernel_traits::square); k_square.setArg(0, tmp_mat); auto& queue = opencl.queue(); - if (training_rows > test_length) { + if (training_rows > + test_length) { // When the number of training instances is greater than the number of test instances + // Test Matrix - Training Matrix k_substract.setArg(0, training_mat); k_substract.setArg(1, training_rows); k_substract.setArg(2, 0u); @@ -165,14 +273,18 @@ void MultivariateKDE::execute_logl_mat(const cl::Buffer& training_mat, k_logl_values_mat.setArg(3, training_rows); k_logl_values_mat.setArg(5, lognorm_const); + // NOTE: Calculates the log-likelihood values for each test instance for (unsigned int i = 0; i < test_length; ++i) { k_substract.setArg(7, i); RAISE_ENQUEUEKERNEL_ERROR(queue.enqueueNDRangeKernel( k_substract, cl::NullRange, cl::NDRange(training_rows * matrices_cols), cl::NullRange)); + RAISE_ENQUEUEKERNEL_ERROR( queue.enqueueNDRangeKernel(k_solve, cl::NullRange, cl::NDRange(training_rows), cl::NullRange)); + RAISE_ENQUEUEKERNEL_ERROR(queue.enqueueNDRangeKernel( k_square, cl::NullRange, cl::NDRange(training_rows * matrices_cols), cl::NullRange)); + k_logl_values_mat.setArg(4, i); RAISE_ENQUEUEKERNEL_ERROR(queue.enqueueNDRangeKernel( k_logl_values_mat, cl::NullRange, cl::NDRange(training_rows), cl::NullRange)); @@ -196,6 +308,7 @@ void MultivariateKDE::execute_logl_mat(const cl::Buffer& training_mat, k_logl_values_mat.setArg(3, training_rows); k_logl_values_mat.setArg(5, lognorm_const); + // ? 
Calculates the log-likelihood values for each test instance for (unsigned int i = 0; i < training_rows; ++i) { k_substract.setArg(7, i); RAISE_ENQUEUEKERNEL_ERROR(queue.enqueueNDRangeKernel( @@ -210,7 +323,7 @@ void MultivariateKDE::execute_logl_mat(const cl::Buffer& training_mat, } } } - +// Computes conditional mu. template void MultivariateKDE::execute_conditional_means(const cl::Buffer& joint_training, const cl::Buffer& marg_training, @@ -447,25 +560,38 @@ DataFrame KDE::_training_data() const { auto rb = arrow::RecordBatch::Make(schema, N, columns); return DataFrame(rb); } - +/** + * @brief Private function to learn the KDE parameters given the training data. + * Used in the public function fit in KDE.cpp. + * + * @tparam ArrowType Arrow data type. + * @tparam contains_null Boolean indicating if the training data contains null values. + * @param df Training data. + */ template void KDE::_fit(const DataFrame& df) { using CType = typename ArrowType::c_type; auto d = m_variables.size(); + // NOTE: Here the positive definiteness of the bandwidth is checked + try { + m_bandwidth = m_bselector->bandwidth(df, m_variables); + } catch (util::singular_covariance_data& e) { + std::cerr << "KDE::_fit:\t" << e.what() << std::endl; + throw e; + } - m_bandwidth = m_bselector->bandwidth(df, m_variables); - + // Calculates the LLT decomposition matrix of the bandwidth matrix auto llt_cov = m_bandwidth.llt(); - auto llt_matrix = llt_cov.matrixLLT(); + auto cholesky = llt_cov.matrixLLT(); auto& opencl = OpenCLConfig::get(); if constexpr (std::is_same_v) { - m_H_cholesky = opencl.copy_to_buffer(llt_matrix.data(), d * d); + m_H_cholesky = opencl.copy_to_buffer(cholesky.data(), d * d); } else { using MatrixType = Matrix; - MatrixType casted_cholesky = llt_matrix.template cast(); + MatrixType casted_cholesky = cholesky.template cast(); m_H_cholesky = opencl.copy_to_buffer(casted_cholesky.data(), d * d); } @@ -473,10 +599,22 @@ void KDE::_fit(const DataFrame& df) { N = training_data->rows(); m_training = opencl.copy_to_buffer(training_data->data(), N * d); - m_lognorm_const = - -llt_matrix.diagonal().array().log().sum() - 0.5 * d * std::log(2 * util::pi) - std::log(N); + // NOTE: The determinant of the bandwidth matrix is the product of the diagonal elements of the cholesky + // - log(|h|) - 1/2 * d * log(2 * pi) - log(N) + m_lognorm_const = -cholesky.diagonal().array().log().sum() - 0.5 * d * std::log(2 * util::pi) - std::log(N); } +/** + * @brief Learns the KDE parameters given the bandwidth matrix, the training data, the training type (?) and the number + * of training instances. + * + * @tparam ArrowType Arrow data type. + * @tparam EigenMatrix Eigen matrix type. + * @param bandwidth Bandwidth matrix. + * @param training_data Training data. + * @param training_type Training type. + * @param training_instances Number of training instances. + */ template void KDE::fit(EigenMatrix bandwidth, cl::Buffer training_data, @@ -506,10 +644,19 @@ void KDE::fit(EigenMatrix bandwidth, m_training = training_data; m_training_type = training_type; N = training_instances; + + // NOTE: The determinant of the bandwidth matrix is the product of the diagonal elements of the cholesky m_lognorm_const = -cholesky.diagonal().array().log().sum() - 0.5 * d * std::log(2 * util::pi) - std::log(N); m_fitted = true; } +/** + * @brief Calculates Log-likelihood of the given data with OpenCL. + * + * @tparam ArrowType Arrow data type. + * @param df Data. + * @return VectorXd Log-likelihood values. 
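+ * @note Editorial summary: per test instance x, _logl_impl computes the usual KDE estimate
+ * logl(x) = logsumexp_i [ -0.5 * ||L^(-1) * (x - x_i)||^2 ] + m_lognorm_const,
+ * where m_lognorm_const (set in _fit) already includes the Gaussian normalization constant and the -log(N)
+ * averaging term.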
+ */
template
VectorXd KDE::_logl(const DataFrame& df) const {
    using CType = typename ArrowType::c_type;
@@ -517,14 +664,14 @@ VectorXd KDE::_logl(const DataFrame& df) const {
     auto logl_buff = logl_buffer(df);
     auto& opencl = OpenCLConfig::get();
-    if (df.null_count(m_variables) == 0) {
+    if (df.null_count(m_variables) == 0) {  // No null values -> read the log-likelihood of every row
         VectorType read_data(df->num_rows());
         opencl.read_from_buffer(read_data.data(), logl_buff, df->num_rows());
         if constexpr (!std::is_same_v)
             return read_data.template cast();
         else
             return read_data;
-    } else {
+    } else {  // Null values present -> read the log-likelihood of the valid rows only
         auto m = df.valid_rows(m_variables);
         VectorType read_data(m);
         auto bitmap = df.combined_bitmap(m_variables);
@@ -560,7 +707,13 @@ double KDE::_slogl(const DataFrame& df) const {
     opencl.read_from_buffer(&result, buffer_sum, 1);
     return static_cast(result);
 }
-
+/**
+ * @brief Calculates the log-likelihood of the given data using _logl_impl.
+ *
+ * @tparam ArrowType Arrow data type.
+ * @param df Data.
+ * @return cl::Buffer Log-likelihood values.
+ */
 template
 cl::Buffer KDE::logl_buffer(const DataFrame& df) const {
     auto& opencl = OpenCLConfig::get();
@@ -575,6 +728,14 @@ cl::Buffer KDE::logl_buffer(const DataFrame& df) const {
     return _logl_impl(test_buffer, m);
 }
+/**
+ * @brief Calculates the log-likelihood of the given data using _logl_impl, considering only the rows marked as
+ * valid in the bitmap.
+ *
+ * @tparam ArrowType Arrow data type.
+ * @param df Data.
+ * @param bitmap Bitmap of valid (non-null) rows.
+ * @return cl::Buffer Log-likelihood values.
+ */
 template
 cl::Buffer KDE::logl_buffer(const DataFrame& df, Buffer_ptr& bitmap) const {
     auto& opencl = OpenCLConfig::get();
@@ -589,6 +750,15 @@ cl::Buffer KDE::logl_buffer(const DataFrame& df, Buffer_ptr& bitmap) const {
     return _logl_impl(test_buffer, m);
 }
+/**
+ * @brief Computes the log-likelihood values of the test data with OpenCL.
+ *
+ * @tparam ArrowType Arrow data type.
+ * @tparam KDEType KDE type.
+ * @param test_buffer Test data.
+ * @param m Number of test instances.
+ * @return cl::Buffer Log-likelihood values.
+ */
 template
 cl::Buffer KDE::_logl_impl(cl::Buffer& test_buffer, int m) const {
     using CType = typename ArrowType::c_type;
@@ -619,6 +789,7 @@ cl::Buffer KDE::_logl_impl(cl::Buffer& test_buffer, int m) const {
                                                 m_lognorm_const,
                                                 tmp_mat_buffer,
                                                 mat_logls);
+        // The log-sum-exp over the training instances yields the log-likelihood of each test instance
         opencl.logsumexp_cols_offset(mat_logls, N, allocated_m, res, i * allocated_m);
     }
     auto remaining_m = m - (iterations - 1) * allocated_m;
@@ -634,6 +805,7 @@ cl::Buffer KDE::_logl_impl(cl::Buffer& test_buffer, int m) const {
                                                 m_lognorm_const,
                                                 tmp_mat_buffer,
                                                 mat_logls);
+    // The log-sum-exp over the training instances yields the log-likelihood of each test instance
     opencl.logsumexp_cols_offset(mat_logls, N, remaining_m, res, (iterations - 1) * allocated_m);
     return res;
diff --git a/pybnesian/kde/NormalReferenceRule.hpp b/pybnesian/kde/NormalReferenceRule.hpp
index 2da850b7..dd890624 100644
--- a/pybnesian/kde/NormalReferenceRule.hpp
+++ b/pybnesian/kde/NormalReferenceRule.hpp
@@ -9,13 +9,22 @@ namespace kde {
 class NormalReferenceRule : public BandwidthSelector {
 public:
+    /**
+     * @brief Public function for calculating the diagonal bandwidth matrix using the Normal Reference Rule given the
+     * data and variables.
+     *
+     * @param df Dataframe.
+     * @param variables Variables.
+     * @return VectorXd Diagonal bandwidth vector.
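+     * @code
+     * // A minimal usage sketch; the DataFrame df and the variable names are illustrative.
+     * NormalReferenceRule selector;
+     * VectorXd h = selector.diag_bandwidth(df, {"a", "b"});
+     * @endcode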
+ */ VectorXd diag_bandwidth(const DataFrame& df, const std::vector& variables) const override { if (variables.empty()) return VectorXd(0); size_t valid_rows = df.valid_rows(variables); - if (valid_rows <= variables.size()) { + if (valid_rows <= variables.size()) { // If the number of (valid) rows is less than the number of variables std::stringstream ss; - ss << "Diagonal bandwidth matrix of " << std::to_string(variables.size()) << " variables [" << variables[0]; + ss << "NormalReferenceRule::diag_bandwidth -> Diagonal bandwidth matrix of " + << std::to_string(variables.size()) << " variables [" << variables[0]; for (size_t i = 1; i < variables.size(); ++i) { ss << ", " << variables[i]; } @@ -30,17 +39,28 @@ class NormalReferenceRule : public BandwidthSelector { case Type::FLOAT: return diag_bandwidth(df, variables); default: - throw std::invalid_argument("Wrong data type to fit bandwidth. [double] or [float] data is expected."); + throw std::invalid_argument( + "NormalReferenceRule::diag_bandwidth -> Wrong data type to fit bandwidth. [double] or [float] data " + "is expected."); } } - + /** + * @brief Public function for calculating the bandwidth matrix using the Normal Reference Rule given the data and + * variables. + * + * @param df Data + * @param variables Variables. + * @return MatrixXd Bandwidth matrix. + */ MatrixXd bandwidth(const DataFrame& df, const std::vector& variables) const override { if (variables.empty()) return MatrixXd(0, 0); auto valid_rows = df.valid_rows(variables); - if (static_cast(valid_rows) <= variables.size()) { + if (static_cast(valid_rows) <= + variables.size()) { // If the number of (valid) rows is less than the number of variables std::stringstream ss; - ss << "Bandwidth matrix of " << std::to_string(variables.size()) << " variables [" << variables[0]; + ss << "NormalReferenceRule::bandwidth -> Bandwidth matrix of " << std::to_string(variables.size()) + << " variables [" << variables[0]; for (size_t i = 1; i < variables.size(); ++i) { ss << ", " << variables[i]; } @@ -50,12 +70,15 @@ class NormalReferenceRule : public BandwidthSelector { } switch (df.same_type(variables)->id()) { + // Here the bandwidth is calculated using the function defined later in the private section. case Type::DOUBLE: return bandwidth(df, variables); case Type::FLOAT: return bandwidth(df, variables); default: - throw std::invalid_argument("Wrong data type to fit bandwidth. [double] or [float] data is expected."); + throw std::invalid_argument( + "NormalReferenceRule::bandwidth -> Wrong data type to fit bandwidth. [double] or [float] data is " + "expected."); } } @@ -68,25 +91,34 @@ class NormalReferenceRule : public BandwidthSelector { } private: + /** + * @brief Private function to calculate the diagonal bandwidth matrix using the Normal Reference Rule given the data + * and variables. If the covariance matrix is not positive definite, an exception is thrown. + * + * @tparam ArrowType Arrow Data type. + * @param df Dataframe. + * @param variables Variables. + * @return VectorXd Diagonal bandwidth vector. 
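+     * @note Editorial summary of the estimator (Equation (3.4) of Chacon and Duong (2018), as commented in the
+     * body): h_j = [ 4 * d * sqrt(det(delta)) / ((2 * tr(delta^(-1) * delta^(-1)) + tr(delta^(-1))^2) * N) ]^(2/(d+4)) * cov_jj,
+     * with delta = diag(cov)^(-1) * cov.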
+ */ template VectorXd diag_bandwidth(const DataFrame& df, const std::vector& variables) const { using CType = typename ArrowType::c_type; auto cov_ptr = df.cov(variables); auto& cov = *cov_ptr; - - if (!util::is_psd(cov)) { - std::stringstream ss; - ss << "Covariance matrix for variables [" << variables[0]; - for (size_t i = 1; i < variables.size(); ++i) { - ss << ", " << variables[i]; - } - ss << "] is not positive-definite."; - throw util::singular_covariance_data(ss.str()); - } - + // NOTE: UNNECESSARY CHECK + // if (!util::is_psd(cov)) { + // std::stringstream ss; + // ss << "NormalReferenceRule::diag_bandwidth -> Covariance matrix for variables [" << variables[0]; + // for (size_t i = 1; i < variables.size(); ++i) { + // ss << ", " << variables[i]; + // } + // ss << "] is not positive-definite."; + // throw util::singular_covariance_data(ss.str()); + // } + // The covariance diagonal is used to calculate the bandwidth auto diag = cov.diagonal(); - auto delta = (cov.array().colwise() * diag.cwiseInverse().array()).matrix(); + auto delta = (cov.array().colwise() * diag.cwiseInverse().array()).matrix(); // diag(cov)^ (-1) * cov auto delta_inv = delta.inverse(); auto N = static_cast(df.valid_rows(variables)); @@ -94,42 +126,63 @@ class NormalReferenceRule : public BandwidthSelector { auto delta_inv_trace = delta_inv.trace(); - // Estimate bandwidth using Equation (3.4) of Chacon and Duong (2018) + // NOTE: Estimate bandwidth using Equation (3.4) of Chacon and Duong (2018) + // [4*d*sqrt(det(delta))] / + // / [(2*trace(delta^(-1)*delta^(-1)) + trace(delta^(-1))^2) * N] auto k = 4 * d * std::sqrt(delta.determinant()) / - (2 * (delta_inv * delta_inv).trace() + delta_inv_trace * delta_inv_trace); - + ((2 * (delta_inv * delta_inv).trace() + delta_inv_trace * delta_inv_trace) * N); + auto k2 = std::pow(k, 2. / (d + 4.)); if constexpr (std::is_same_v) { - return std::pow(k / N, 2. / (d + 4.)) * diag; + return k2 * diag; } else { - return (std::pow(k / N, 2. / (d + 4.)) * diag).template cast(); + return (k2 * diag).template cast(); } } - + /** + * @brief Private function to calculate the bandwidth matrix using the Normal Reference Rule given the data and + * variables. If the covariance matrix is not positive definite, an exception is thrown. + * + * @tparam ArrowType Arrow Data type. + * @param df Dataframe. + * @param variables Variables. + * @return MatrixXd Bandwidth matrix. 
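+     * @note Editorial summary: the full-matrix rule below reduces to H = (4 / ((d + 2) * N))^(2/(d+4)) * cov,
+     * i.e. a scalar shrinkage of the sample covariance matrix (see the k formula in the body).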
+ */ template MatrixXd bandwidth(const DataFrame& df, const std::vector& variables) const { using CType = typename ArrowType::c_type; - auto cov = df.cov(variables); + auto cov_ptr = df.cov(variables); + auto& cov = *cov_ptr; - if (!util::is_psd(*cov)) { - std::stringstream ss; - ss << "Covariance matrix for variables [" << variables[0]; - for (size_t i = 1; i < variables.size(); ++i) { - ss << ", " << variables[i]; - } - ss << "] is not positive-definite."; - throw util::singular_covariance_data(ss.str()); - } + // NOTE: UNNECESSARY CHECK + // if (!util::is_psd(cov)) { + // std::stringstream ss; + // ss << "Covariance matrix for variables [" << variables[0]; + // for (size_t i = 1; i < variables.size(); ++i) { + // ss << ", " << variables[i]; + // } + // ss << "] is not positive-definite."; + // throw util::singular_covariance_data(ss.str()); + // } + // TODO: OPTIMIZE THIS + // We put the non-diagonal elements to zero + // for (auto i = 0; i < cov.rows(); ++i) { + // for (auto j = 0; j < cov.cols(); ++j) { + // if (i != j) { + // cov(i, j) = 0; + // } + // } + // } auto N = static_cast(df.valid_rows(variables)); auto d = static_cast(variables.size()); - + // Normal Reference Rule formula squared for the bandwidth auto k = std::pow(4. / (N * (d + 2.)), 2. / (d + 4)); if constexpr (std::is_same_v) { - return k * (*cov); + return k * cov; } else { - return k * cov->template cast(); + return (k * cov).template cast(); } } }; diff --git a/pybnesian/kde/ProductKDE.hpp b/pybnesian/kde/ProductKDE.hpp index ca211beb..d75de9c0 100644 --- a/pybnesian/kde/ProductKDE.hpp +++ b/pybnesian/kde/ProductKDE.hpp @@ -1,11 +1,11 @@ #ifndef PYBNESIAN_KDE_PRODUCTKDE_HPP #define PYBNESIAN_KDE_PRODUCTKDE_HPP -#include #include #include #include #include +#include using opencl::OpenCLConfig, opencl::OpenCL_kernel_traits; @@ -165,6 +165,12 @@ void ProductKDE::_fit(const DataFrame& df) { auto& opencl = OpenCLConfig::get(); + // NOTE: Here the positive definiteness of the bandwidth is checked + // if bandwidth is not positive definite, + // - try to add a small value to the diagonal? + // m_bandwidth = m_bandwidth + VectorXd::Constant(m_variables.size(), 1e-6); + + // - Add to blacklist and ignore this iteration? m_bandwidth = m_bselector->diag_bandwidth(df, m_variables); for (size_t i = 0; i < m_variables.size(); ++i) { @@ -184,7 +190,7 @@ void ProductKDE::_fit(const DataFrame& df) { m_training.push_back(opencl.copy_to_buffer(column->data(), N)); } } - + // -1/2 * d * log(2 * pi) - 1/2 * log(|h|) - log(N) m_lognorm_const = -0.5 * static_cast(m_variables.size()) * std::log(2 * util::pi) - 0.5 * m_bandwidth.array().log().sum() - std::log(N); } diff --git a/pybnesian/kde/ScottsBandwidth.hpp b/pybnesian/kde/ScottsBandwidth.hpp index fa30604d..45f8a896 100644 --- a/pybnesian/kde/ScottsBandwidth.hpp +++ b/pybnesian/kde/ScottsBandwidth.hpp @@ -5,13 +5,22 @@ namespace kde { class ScottsBandwidth : public BandwidthSelector { public: + /** + * @brief Public function for calculating the diagonal bandwidth matrix using Scott's Rule given the data and + * variables. + * + * @param df Dataframe. + * @param variables Variables. + * @return VectorXd Diagonal bandwidth vector. 
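+     * @code
+     * // A minimal usage sketch; df and the variable names are illustrative.
+     * ScottsBandwidth selector;
+     * VectorXd h = selector.diag_bandwidth(df, {"a", "b"});
+     * @endcode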
+ */ VectorXd diag_bandwidth(const DataFrame& df, const std::vector& variables) const override { if (variables.empty()) return VectorXd(0); size_t valid_rows = df.valid_rows(variables); if (valid_rows <= 1) { std::stringstream ss; - ss << "Diagonal bandwidth matrix of " << std::to_string(variables.size()) << " variables [" << variables[0]; + ss << "ScottsBandwidth::diag_bandwidth -> Diagonal bandwidth matrix of " << std::to_string(variables.size()) + << " variables [" << variables[0]; for (size_t i = 1; i < variables.size(); ++i) { ss << ", " << variables[i]; } @@ -26,17 +35,26 @@ class ScottsBandwidth : public BandwidthSelector { case Type::FLOAT: return diag_bandwidth(df, variables); default: - throw std::invalid_argument("Wrong data type to fit bandwidth. [double] or [float] data is expected."); + throw std::invalid_argument( + "ScottsBandwidth::diag_bandwidth -> Wrong data type to fit bandwidth. [double] or [float] data is " + "expected."); } } - + /** + * @brief Public function for calculating the bandwidth matrix using Scott's Rule given the data and variables. + * + * @param df Data + * @param variables Variables. + * @return MatrixXd Bandwidth matrix. + */ MatrixXd bandwidth(const DataFrame& df, const std::vector& variables) const override { if (variables.empty()) return MatrixXd(0, 0); size_t valid_rows = df.valid_rows(variables); if (valid_rows <= variables.size()) { std::stringstream ss; - ss << "Bandwidth matrix of " << std::to_string(variables.size()) << " variables [" << variables[0]; + ss << "ScottsBandwidth::bandwidth -> Bandwidth matrix of " << std::to_string(variables.size()) + << " variables [" << variables[0]; for (size_t i = 1; i < variables.size(); ++i) { ss << ", " << variables[i]; } @@ -51,7 +69,9 @@ class ScottsBandwidth : public BandwidthSelector { case Type::FLOAT: return bandwidth(df, variables); default: - throw std::invalid_argument("Wrong data type to fit bandwidth. [double] or [float] data is expected."); + throw std::invalid_argument( + "ScottsBandwidth::bandwidth -> Wrong data type to fit bandwidth. [double] or [float] data is " + "expected."); } } @@ -62,6 +82,15 @@ class ScottsBandwidth : public BandwidthSelector { static std::shared_ptr __setstate__(py::tuple&) { return std::make_shared(); } private: + /** + * @brief Private function for calculating the diagonal bandwidth matrix using Scott's Rule given the data and + * variables + * + * @tparam ArrowType Arrow Data type. + * @param df Dataframe. + * @param variables Variables. + * @return VectorXd Diagonal bandwidth vector. + */ template VectorXd diag_bandwidth(const DataFrame& df, const std::vector& variables) const { using CType = typename ArrowType::c_type; @@ -86,14 +115,21 @@ class ScottsBandwidth : public BandwidthSelector { return bandwidth; } - + /** + * @brief Private function for calculating the bandwidth matrix using Scott's Rule given the data and variables. + * + * @tparam ArrowType Arrow Data type. + * @param df Dataframe. + * @param variables Variables. + * @return MatrixXd Bandwidth matrix. 
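+     * @note Editorial summary: Scott's rule below scales the sample covariance as H = N^(-2/(d+4)) * cov
+     * (see the k formula in the body).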
+ */ template MatrixXd bandwidth(const DataFrame& df, const std::vector& variables) const { using CType = typename ArrowType::c_type; - auto cov = df.cov(variables); - - if (!util::is_psd(*cov)) { + auto cov_ptr = df.cov(variables); + auto& cov = *cov_ptr; + if (!util::is_psd(cov)) { std::stringstream ss; ss << "Covariance matrix for variables [" << variables[0]; for (size_t i = 1; i < variables.size(); ++i) { @@ -102,16 +138,26 @@ class ScottsBandwidth : public BandwidthSelector { ss << "] is not positive-definite."; throw util::singular_covariance_data(ss.str()); } + // TODO: OPTIMIZE THIS + // We put the non-diagonal elements to zero + // for (auto i = 0; i < cov.rows(); ++i) { + // for (auto j = 0; j < cov.cols(); ++j) { + // if (i != j) { + // cov(i, j) = 0; + // } + // } + // } auto N = static_cast(df.valid_rows(variables)); auto d = static_cast(variables.size()); + // Scott's Rule formula auto k = std::pow(N, -2. / (d + 4)); if constexpr (std::is_same_v) { - return k * (*cov); + return k * cov; } else { - return k * cov->template cast(); + return (k * cov).template cast(); } } }; diff --git a/pybnesian/kde/opencl_kernels/KDE.cl.src b/pybnesian/kde/opencl_kernels/KDE.cl.src index d41098b0..259eb474 100644 --- a/pybnesian/kde/opencl_kernels/KDE.cl.src +++ b/pybnesian/kde/opencl_kernels/KDE.cl.src @@ -1,4 +1,5 @@ -/* This code assumes column major data for matrices. */ +/* This file contains opencl code for matrix operations. +It assumes column major data for matrices. */ #define IDX(i, j, rows) (i) + ((j)*(rows)) #define ROW(idx, rows) (idx) % (rows) diff --git a/pybnesian/learning/algorithms/hillclimbing.cpp b/pybnesian/learning/algorithms/hillclimbing.cpp index 130bbadf..e21a2703 100644 --- a/pybnesian/learning/algorithms/hillclimbing.cpp +++ b/pybnesian/learning/algorithms/hillclimbing.cpp @@ -23,6 +23,35 @@ using util::ArcStringVector; namespace learning::algorithms { +/** + * @brief Executes a greedy hill-climbing algorithm for Bayesian network structure learning. This calls + GreedyHillClimbing.estimate(). + * + * @param df DataFrame used to learn a Bayesian network model. + * @param bn_type BayesianNetworkType of the returned model. If start is given, bn_type is ignored. Defaults to + * pbn.SemiparametricBNType(). + * @param start Initial structure of the GreedyHillClimbing. If None, a new Bayesian network model is created. Defaults + * to None. + * @param score_str A string representing the score used to drive the search. + The possible options are: “bic” for BIC, “bge” for BGe, “cv-lik” for CVLikelihood, “holdout-lik” for + HoldoutLikelihood, “validated-lik" for ValidatedLikelihood. Defaults to "validated-lik". + * @param operators_str Set of operators in the search process. Defaults to ["arcs", "node_type"]. + * @param arc_blacklist List of arcs blacklist (forbidden arcs). Defaults to []. + * @param arc_whitelist List of arcs whitelist (forced arcs). Defaults to []. + * @param type_blacklist List of type blacklist (forbidden types). Defaults to []. + * @param type_whitelist List of type whitelist (forced types). Defaults to []. + * @param callback Callback object that is called after each iteration. Defaults to None. + * @param max_indegree Maximum indegree allowed in the graph. Defaults to 0. + * @param max_iters Maximum number of search iterations. Defaults to 2147483647. + * @param epsilon Minimum delta score allowed for each operator. If the new operator is less than epsilon, the search + process is stopped. Defaults to 0. 
+ * @param patience The patience parameter (only used with pbn.ValidatedScore). Defaults to 0. + * @param seed Seed parameter of the score (if needed). Defaults to None. + * @param num_folds Number of folds for the CVLikelihood and ValidatedLikelihood scores. Defaults to 10. + * @param test_holdout_ratio Parameter for the HoldoutLikelihood and ValidatedLikelihood scores. Defaults to 0.2. + * @param verbose If True the progress will be displayed, otherwise nothing will be displayed. Defaults to 0. + * @return std::shared_ptr The estimated Bayesian network structure. + */ std::shared_ptr hc(const DataFrame& df, const std::shared_ptr bn_type, const std::shared_ptr start, @@ -44,7 +73,7 @@ std::shared_ptr hc(const DataFrame& df, if (!bn_type && !start) { throw std::invalid_argument("\"bn_type\" or \"start\" parameter must be specified."); } - + // If seed is not given, it is set to a random value. auto iseed = [seed]() { if (seed) return *seed; @@ -52,6 +81,7 @@ std::shared_ptr hc(const DataFrame& df, return std::random_device{}(); }(); + // If bn_type is not given, it is set to the type of the given start model. const auto& bn_type_ = [&start, &bn_type]() -> const BayesianNetworkType& { if (start) return start->type_ref(); @@ -59,11 +89,14 @@ std::shared_ptr hc(const DataFrame& df, return *bn_type; }(); + // Checks if the given operators are valid for the given Bayesian network type ["arcs", "node_type"]. auto operators = util::check_valid_operators( bn_type_, operators_str, arc_blacklist, arc_whitelist, max_indegree, type_whitelist); + // If max_iters is 0, it is set to the maximum integer value. if (max_iters == 0) max_iters = std::numeric_limits::max(); + // If start is given, it is used as the initial model. Otherwise, a new model is created. const auto start_model = [&start, &bn_type_, &df]() -> const std::shared_ptr { if (start) return start; @@ -72,8 +105,9 @@ std::shared_ptr hc(const DataFrame& df, }(); GreedyHillClimbing hc; - auto score = util::check_valid_score(df, bn_type_, score_str, iseed, num_folds, test_holdout_ratio); + // If score is not given, it is set to the default score for the given Bayesian network type. + auto score = util::check_valid_score(df, bn_type_, score_str, iseed, num_folds, test_holdout_ratio); return hc.estimate(*operators, *score, *start_model, diff --git a/pybnesian/learning/algorithms/hillclimbing.hpp b/pybnesian/learning/algorithms/hillclimbing.hpp index a98900cf..75bfabea 100644 --- a/pybnesian/learning/algorithms/hillclimbing.hpp +++ b/pybnesian/learning/algorithms/hillclimbing.hpp @@ -42,7 +42,16 @@ std::shared_ptr hc(const DataFrame& df, int num_folds, double test_holdout_ratio, int verbose = 0); - +/** + * @brief Calculates the validation delta score for each of the variables. + * + * @tparam T Type of the Bayesian network. + * @param model Bayesian network. + * @param val_score Validated score. + * @param variables List of variables. + * @param current_local_scores Local score cache. + * @return double The validation delta score. + */ template double validation_delta_score(const T& model, const ValidatedScore& val_score, @@ -58,7 +67,28 @@ double validation_delta_score(const T& model, return nnew - prev; } - +/** + * @brief Executes a greedy hill-climbing algorithm for Bayesian network structure learning. + * + * @tparam zero_patience True if patience == 0, False otherwise. + * @tparam S Type of the score. + * @tparam T Type of the Bayesian network. + * @param op_set Set of operators in the search process. 
+ * @param score Score that drives the search. + * @param start Initial structure. A BayesianNetworkBase or ConditionalBayesianNetworkBase. + * @param arc_blacklist List of arcs blacklist (forbidden arcs). + * @param arc_whitelist List of arcs whitelist (forced arcs). + * @param type_blacklist List of type blacklist (forbidden pbn.FactorType). + * @param type_whitelist List of type whitelist (forced pbn.FactorType). + * @param callback Callback object that is called after each iteration. + * @param max_indegree Maximum indegree allowed in the graph. + * @param max_iters Maximum number of search iterations. + * @param epsilon Minimum delta score allowed for each operator. If (best_op->delta() - epsilon) < util::machine_tol, + * then the search process is stopped. + * @param patience The patience parameter (only used with ValidatedScore). + * @param verbose If True the progress will be displayed, otherwise nothing will be displayed. + * @return std::shared_ptr The estimated Bayesian network structure of the same type as start. + */ template std::shared_ptr estimate_hc(OperatorSet& op_set, S& score, @@ -73,131 +103,216 @@ std::shared_ptr estimate_hc(OperatorSet& op_set, double epsilon, int patience, int verbose) { - auto spinner = util::indeterminate_spinner(verbose); - spinner->update_status("Checking dataset..."); - - auto current_model = start.clone(); - current_model->force_type_whitelist(type_whitelist); - - if (current_model->has_unknown_node_types()) { - auto score_data = score.data(); - - if (score_data->num_columns() == 0) { - throw std::invalid_argument( - "The score does not have data to detect the node types. Set the node types for" - " all the nodes in the Bayesian network or use an score that uses data (it implements Score::data)."); - } - - score_data.raise_has_columns(current_model->nodes()); - current_model->set_unknown_node_types(score_data, type_blacklist); - } - - current_model->check_blacklist(arc_blacklist); - current_model->force_whitelist(arc_whitelist); - - op_set.set_arc_blacklist(arc_blacklist); - op_set.set_arc_whitelist(arc_whitelist); - op_set.set_type_blacklist(type_blacklist); - op_set.set_type_whitelist(type_whitelist); - op_set.set_max_indegree(max_indegree); - - auto prev_current_model = current_model->clone(); - auto best_model = current_model; - - spinner->update_status("Caching scores..."); + std::string log_str = "HILL-CLIMBING::estimate_hc:\t"; + try { + util::formatted_log_t(verbose, log_str + "Begins"); + // We copy the arc_blacklist + // auto arc_blacklist_copy = arc_blacklist; + + // Spinner for the progress bar + auto spinner = util::indeterminate_spinner(verbose); + spinner->update_status("Checking dataset..."); + + // Model initialization + auto current_model = start.clone(); + // Model type validation + current_model->force_type_whitelist(type_whitelist); + if (current_model->has_unknown_node_types()) { + auto score_data = score.data(); + + if (score_data->num_columns() == 0) { + throw std::invalid_argument( + "The score does not have data to detect the node types. 
Set the node types for"
+                    " all the nodes in the Bayesian network or use a score that uses data (it implements "
+                    "Score::data).");
+            }
-    LocalScoreCache local_validation = [&]() {
-        if constexpr (std::is_base_of_v) {
-            LocalScoreCache lc(*current_model);
-            lc.cache_vlocal_scores(*current_model, score);
-            return lc;
-        } else if constexpr (std::is_base_of_v) {
-            return LocalScoreCache{};
-        } else {
-            static_assert(util::always_false, "Wrong Score class for hill-climbing.");
+            score_data.raise_has_columns(current_model->nodes());
+            current_model->set_unknown_node_types(score_data, type_blacklist);
         }
-    }();
-
-    op_set.cache_scores(*current_model, score);
-    int p = 0;
-    double accumulated_offset = 0;
-
-    OperatorTabuSet tabu_set;
-
-    if (callback) callback->call(*current_model, nullptr, score, 0);
-
-    auto iter = 0;
-    while (iter < max_iters) {
-        ++iter;
-
-        auto best_op = [&]() {
-            if constexpr (zero_patience)
-                return op_set.find_max(*current_model);
-            else
-                return op_set.find_max(*current_model, tabu_set);
+        // Model arc validation
+        current_model->check_blacklist(
+            arc_blacklist);  // Checks whether the arc_blacklist is valid for the current_model
+        current_model->force_whitelist(arc_whitelist);  // Include the given whitelisted arcs. It checks the validity of
+                                                        // the graph after including the arc whitelist.
+
+        // OperatorSet initialization
+        op_set.set_arc_blacklist(arc_blacklist);
+        op_set.set_arc_whitelist(arc_whitelist);
+        op_set.set_type_blacklist(type_blacklist);
+        op_set.set_type_whitelist(type_whitelist);
+        op_set.set_max_indegree(max_indegree);
+
+        // Search model initialization
+        auto prev_current_model = current_model->clone();
+        auto best_model = current_model;
+
+        spinner->update_status("Caching scores...");
+
+        // NOTE: Here the score of each node is calculated (log-likelihood fit)
+        // Since the search starts with all nodes parentless, the independent score of each node is computed first,
+        // and it turns out to be non-zero
+        // Options:
+        // 1. Compute the independent score and, when the log-likelihood fit fails, apply regularization?
+        // 2. Remove the variable?
+        // 3. Wrap the caching in a try-catch so that, when an error is raised, regularization is added to the score
+        //    (a commented sketch appears right before the caching call below)
+
+        // TODO: It crashes when there are isolated variables with zero variance during cross-validation -> Fix?
+        // Initializes the local validation scores for the current model
+        util::formatted_log_t(verbose, log_str + "Local Validation TBC");
+        LocalScoreCache local_validation = [&]() {  // Local validation scores (lambda expression)
+            if constexpr (std::is_base_of_v) {  // If the score is a ValidatedScore
+                LocalScoreCache lc(*current_model);  // Local score cache
+                lc.cache_vlocal_scores(*current_model, score);  // Cache the local scores
+                return lc;
+            } else if constexpr (std::is_base_of_v) {  // If the score is a generic Score
+                return LocalScoreCache{};
+            } else {
+                static_assert(util::always_false, "Wrong Score class for hill-climbing.");
+            }
         }();
-        if (!best_op || (best_op->delta() - epsilon) < util::machine_tol) {
-            break;
-        }
+        util::formatted_log_t(verbose, log_str + "Local Validation Calculated");
+        // Cache scores
+        util::formatted_log_t(verbose, log_str + "op_set.cache_scores TBC");
+        // Caches the delta score values of each operator in the set.
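+        // A hypothetical sketch of option 3 above (not implemented here): retry the caching step with a
+        // regularized score when the covariance of some node is singular.
+        //   try {
+        //       op_set.cache_scores(*current_model, score);
+        //   } catch (util::singular_covariance_data& e) {
+        //       // e.g. add a small ridge to the offending covariance and cache again
+        //   }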
+        op_set.cache_scores(*current_model, score);
+        int p = 0;
+        double accumulated_offset = 0;
+
+        util::formatted_log_t(verbose, log_str + "Scores cached");
+        OperatorTabuSet tabu_set;
+
+        if (callback) callback->call(*current_model, nullptr, score, 0);
+        util::formatted_log_t(verbose, log_str + "Hill climbing iterations begin");
+        // Hill climbing iterations begin
+        auto iter = 0;
+        while (iter < max_iters) {
+            ++iter;
+            // Finds the best operator
+            // HC Algorithm lines 8 -> 16 [Atienza et al. (2022)]
+            // NOTE: Here the best operators are evaluated (log-likelihood fit)
+            util::formatted_log_t(verbose, log_str + "Best operator TBC");
+            auto best_op = [&]() {
+                if constexpr (zero_patience)
+                    return op_set.find_max(*current_model);
+                else
+                    return op_set.find_max(*current_model, tabu_set);
+            }();
+
+            // If the best operator is null or its delta is less than epsilon, the search process stops
+            if (!best_op || (best_op->delta() - epsilon) < util::machine_tol) {
+                util::formatted_log_t(verbose, log_str + "No improvement in best_op");
+                break;
+            }
+            util::formatted_log_t(verbose, log_str + "Best operator calculated: " + best_op->ToString());
+
+            // NOTE: S_validation may pass here
+            // HC Algorithm lines 17 -> 24 [Atienza et al. (2022)]: applies the best operator
+            // to the current model
+            best_op->apply(*current_model);
+            // Returns the nodes changed by the best operator
+            auto nodes_changed = best_op->nodes_changed(*current_model);
+
+            // Calculates the validation delta
+            util::formatted_log_t(verbose, log_str + "Validation Delta TBC");
+
+            double validation_delta = [&]() {
+                if constexpr (std::is_base_of_v) {
+                    return validation_delta_score(*current_model, score, nodes_changed, local_validation);
+                } else {
+                    return best_op->delta();
+                }
+            }();
+            util::formatted_log_t(verbose, log_str + "Validation Delta Calculated");
+            // Updates the best model if the validation delta is greater than 0
+            if ((validation_delta + accumulated_offset) >
+                util::machine_tol) {  // If the validation delta is greater than 0, then the current model is the best
+                                      // model
+                util::formatted_log_t(verbose, log_str + "Validation Delta is greater than 0");
+                if constexpr (!zero_patience) {
+                    if (p > 0) {
+                        best_model = current_model;
+                        p = 0;
+                        accumulated_offset = 0;
+                    }
+
+                    tabu_set.clear();
+                }
+            } else {  // If the validation delta is less than 0, then the current model is not the best model
+                util::formatted_log_t(verbose, log_str + "Validation Delta is less than 0");
+                if constexpr (zero_patience) {
+                    best_model = prev_current_model;
+                    break;
+                } else {
+                    if (p == 0) best_model = prev_current_model->clone();
+                    if (++p > patience) break;
+                    accumulated_offset += validation_delta;
+                    tabu_set.insert(best_op->opposite(*current_model));  // Add the opposite operator to the tabu set
+                }
+            }
-        best_op->apply(*current_model);
+            // Updates the previous current model
+            best_op->apply(*prev_current_model);
-        auto nodes_changed = best_op->nodes_changed(*current_model);
+            if (callback) callback->call(*current_model, best_op.get(), score, iter);
-        double validation_delta = [&]() {
+            util::formatted_log_t(verbose, log_str + "Updating scores");
+            // NOTE: Here the node scores are reevaluated (log-likelihood fit)
+            op_set.update_scores(*current_model, score, nodes_changed);
+            util::formatted_log_t(verbose, log_str + "Scores updated");
             if constexpr (std::is_base_of_v) {
-                return validation_delta_score(*current_model, score, nodes_changed, local_validation);
+                spinner->update_status(best_op->ToString() +
+                                       " | Validation delta: " + std::to_string(validation_delta));
+            } else if constexpr (std::is_base_of_v) {
+                spinner->update_status(best_op->ToString());
             } else {
-                return best_op->delta();
+                static_assert(util::always_false, "Wrong Score class for hill-climbing.");
             }
-        }();
-        if ((validation_delta + accumulated_offset) > util::machine_tol) {
-            if constexpr (!zero_patience) {
-                if (p > 0) {
-                    best_model = current_model;
-                    p = 0;
-                    accumulated_offset = 0;
-                }
+        }  // End of Hill climbing iterations
-                tabu_set.clear();
-            }
-        } else {
-            if constexpr (zero_patience) {
-                best_model = prev_current_model;
-                break;
-            } else {
-                if (p == 0) best_model = prev_current_model->clone();
-                if (++p > patience) break;
-                accumulated_offset += validation_delta;
-                tabu_set.insert(best_op->opposite(*current_model));
-            }
-        }
+        op_set.finished();
-        best_op->apply(*prev_current_model);
+        if (callback) callback->call(*best_model, nullptr, score, iter);
-        if (callback) callback->call(*current_model, best_op.get(), score, iter);
+        spinner->mark_as_completed("Finished Hill-climbing!");
+        return best_model;
+    } catch (util::singular_covariance_data& e) {
+        util::formatted_log_t(verbose, log_str + "singular_covariance_data caught");
+        throw e;
+        // auto arc_best_op = dynamic_cast(best_op.get());
+        // auto source_arc = arc_best_op->source();
+        // auto target_arc = arc_best_op->target();
-        op_set.update_scores(*current_model, score, nodes_changed);
+        // std::cout << e.what() << std::endl;
+        // std::cout << "Source arc:\t" << source_arc << std::endl;
+        // std::cout << "Target arc:\t" << target_arc << std::endl;
-        if constexpr (std::is_base_of_v) {
-            spinner->update_status(best_op->ToString() + " | Validation delta: " + std::to_string(validation_delta));
-        } else if constexpr (std::is_base_of_v) {
-            spinner->update_status(best_op->ToString());
-        } else {
-            static_assert(util::always_false, "Wrong Score class for hill-climbing.");
-        }
+        // arc_blacklist_copy.push_back(std::make_pair(source_arc, target_arc));
+        // std::cout << "New arc_blacklist:\t" << arc_blacklist << std::endl;
+        // op_set.set_arc_blacklist(arc_blacklist_copy);
     }
-
-    op_set.finished();
-
-    if (callback) callback->call(*best_model, nullptr, score, iter);
-
-    spinner->mark_as_completed("Finished Hill-climbing!");
-    return best_model;
 }
-
+/**
+ * @brief Downcasts the score (to ValidatedScore when available) and, depending on the patience value, dispatches to
+ * the corresponding estimate_hc specialization to estimate the structure of the Bayesian network.
+ *
+ * @tparam T Type of the Bayesian network.
+ * @param op_set Set of operators in the search process.
+ * @param score Score that drives the search.
+ * @param start Initial structure. A BayesianNetworkBase or ConditionalBayesianNetworkBase.
+ * @param arc_blacklist List of arcs blacklist (forbidden arcs).
+ * @param arc_whitelist List of arcs whitelist (forced arcs).
+ * @param type_blacklist List of type blacklist (forbidden pbn.FactorType).
+ * @param type_whitelist List of type whitelist (forced pbn.FactorType).
+ * @param callback Callback object that is called after each iteration.
+ * @param max_indegree Maximum indegree allowed in the graph.
+ * @param max_iters Maximum number of search iterations.
+ * @param epsilon Minimum delta score allowed for each operator.
+ * @param patience The patience parameter (only used with ValidatedScore).
+ * @param verbose If True the progress will be displayed, otherwise nothing will be displayed.
+ * @return std::shared_ptr The estimated Bayesian network structure.
+ */
 template
 std::shared_ptr estimate_downcast_score(OperatorSet& op_set,
                                         Score& score,
@@ -274,7 +389,25 @@ std::shared_ptr estimate_downcast_score(OperatorSet& op_set,
         }
     }
 }
-
+/**
+ * @brief Checks the parameters of the hill climbing algorithm and estimates the structure of a Bayesian network.
+ *
+ * @tparam T Type of the Bayesian network.
+ * @param op_set Set of operators in the search process.
+ * @param score Score that drives the search.
+ * @param start Initial structure. A BayesianNetworkBase or ConditionalBayesianNetworkBase.
+ * @param arc_blacklist List of arcs blacklist (forbidden arcs).
+ * @param arc_whitelist List of arcs whitelist (forced arcs).
+ * @param type_blacklist List of type blacklist (forbidden pbn.FactorType).
+ * @param type_whitelist List of type whitelist (forced pbn.FactorType).
+ * @param callback Callback object that is called after each iteration.
+ * @param max_indegree Maximum indegree allowed in the graph.
+ * @param max_iters Maximum number of search iterations.
+ * @param epsilon Minimum delta score allowed for each operator.
+ * @param patience The patience parameter (only used with ValidatedScore).
+ * @param verbose If True the progress will be displayed, otherwise nothing will be displayed.
+ * @return std::shared_ptr The estimated Bayesian network structure.
+ */
 template
 std::shared_ptr estimate_checks(OperatorSet& op_set,
                                 Score& score,
@@ -313,6 +446,28 @@ std::shared_ptr estimate_checks(OperatorSet& op_set,
 class GreedyHillClimbing {
 public:
+    /**
+     * @brief Estimates the structure of a Bayesian network. The estimated Bayesian network is of the same type as
+     * start. The set of operators allowed in the search is op_set. The delta score of each operator is evaluated
+     * using the score. The initial structure of the algorithm is the model start.
+     *
+     * @tparam T Type of the Bayesian network.
+     * @param op_set Set of operators in the search process.
+     * @param score Score that drives the search.
+     * @param start Initial structure. A BayesianNetworkBase or ConditionalBayesianNetworkBase.
+     * @param arc_blacklist List of arcs blacklist (forbidden arcs).
+     * @param arc_whitelist List of arcs whitelist (forced arcs).
+     * @param type_blacklist List of type blacklist (forbidden pbn.FactorType).
+     * @param type_whitelist List of type whitelist (forced pbn.FactorType).
+     * @param callback Callback object that is called after each iteration.
+     * @param max_indegree Maximum indegree allowed in the graph.
+     * @param max_iters Maximum number of search iterations.
+     * @param epsilon Minimum delta score allowed for each operator. If the best operator's delta is less than
+     * epsilon, the search process is stopped.
+     * @param patience The patience parameter (only used with pbn.ValidatedScore).
+     * @param verbose If True the progress will be displayed, otherwise nothing will be displayed.
+     * @return std::shared_ptr The estimated Bayesian network structure of the same type as start.
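+     * @code
+     * // A minimal usage sketch; every argument value here is illustrative.
+     * GreedyHillClimbing hc;
+     * auto model = hc.estimate(op_set, score, start, arc_blacklist, arc_whitelist, type_blacklist,
+     *                          type_whitelist, nullptr, 0, 500, 0.0, 5, 1);
+     * @endcode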
+ */ template std::shared_ptr estimate(OperatorSet& op_set, Score& score, diff --git a/pybnesian/learning/independences/hybrid/mixed_knncmi.cpp b/pybnesian/learning/independences/hybrid/mixed_knncmi.cpp new file mode 100644 index 00000000..16a4312b --- /dev/null +++ b/pybnesian/learning/independences/hybrid/mixed_knncmi.cpp @@ -0,0 +1,689 @@ +#include +#include +#include +#include +#include +#include +#include + +using Array_ptr = std::shared_ptr; +using vptree::hash_columns; + +namespace learning::independences::hybrid { + +template +DataFrame scale_data(const DataFrame& df, const std::string& scaling) { + using ArrayType = typename arrow::TypeTraits::ArrayType; + using CType = typename ArrowType::c_type; + using kdtree::IndexComparator; + + arrow::SchemaBuilder b(arrow::SchemaBuilder::ConflictPolicy::CONFLICT_ERROR); + std::vector new_columns; + + arrow::NumericBuilder builder; + auto n_rows = df->num_rows(); + + std::vector indices(n_rows); + std::iota(indices.begin(), indices.end(), 0); + + std::vector ranked_data(n_rows); + + for (int j = 0; j < df->num_columns(); ++j) { + auto column = df.col(j); + auto dt = column->type_id(); + switch (dt) { + // discrete variables are kept as their dictionary indices + case Type::DICTIONARY: { + auto column_cast = std::static_pointer_cast(column); + auto indices = std::static_pointer_cast(column_cast->indices()); + for (int i = 0; i < n_rows; ++i) { + RAISE_STATUS_ERROR(builder.Append(static_cast(indices->Value(i)))); + } + break; + } + // transform only the continuous variables + default: { + if (scaling == "normalized_rank") { + auto dwn = df.downcast(j); + auto raw_values = dwn->raw_values(); + + IndexComparator comp(raw_values); + std::sort(indices.begin(), indices.end(), comp); + + for (int i = 0; i < n_rows; ++i) { + ranked_data[indices[i]] = static_cast(i) / static_cast(n_rows - 1); + } + + RAISE_STATUS_ERROR(builder.AppendValues(ranked_data.begin(), ranked_data.end())); + + } else if (scaling == "min_max") { + auto column_cast = std::static_pointer_cast(column); + auto min = df.min(j); + auto max = df.max(j); + if (max != min) { + for (int i = 0; i < n_rows; ++i) { + auto normalized_value = (column_cast->Value(i) - min) / (max - min); + RAISE_STATUS_ERROR(builder.Append(normalized_value)); + } + } else { + throw std::invalid_argument("Constant column in DataFrame."); + } + + } else { + throw std::invalid_argument("Invalid scaling option, must be either normalized_rank or min_max."); + } + } + } + Array_ptr out; + RAISE_STATUS_ERROR(builder.Finish(&out)); + new_columns.push_back(out); + builder.Reset(); + + auto f = arrow::field(df.name(j), out->type()); + RAISE_STATUS_ERROR(b.AddField(f)); + } + + RAISE_RESULT_ERROR(auto schema, b.Finish()) + + auto rb = arrow::RecordBatch::Make(schema, n_rows, new_columns); + return DataFrame(rb); +} + +DataFrame scale_data(const DataFrame& df, const std::string& scaling) { + // check continuous columns dtype + auto cont_cols = df.continuous_columns(); + std::shared_ptr dt; + if (cont_cols.size() > 0) { + dt = df.loc(cont_cols).same_type(); + } else { + // if fully discrete use smaller dtype + dt = std::static_pointer_cast(arrow::float32()); + } + switch (dt->id()) { + case Type::DOUBLE: + return scale_data(df, scaling); + case Type::FLOAT: + return scale_data(df, scaling); + default: + throw std::invalid_argument("Wrong data type in MixedKMutualInformation."); + } +} + +double mi_general(VPTree& ztree, + DataFrame& df, + int k, + std::shared_ptr datatype, + std::vector& is_discrete_column, + int 
tree_leafsize, + unsigned int seed) { + auto n_rows = df->num_rows(); + VPTree vptree(df, datatype, is_discrete_column, tree_leafsize, seed); + // excluding the reference point which is not a neighbor of itself + auto knn_results = vptree.query(df, k + 1); + + VectorXd eps(n_rows); + VectorXi k_hat(n_rows); + + for (auto i = 0; i < n_rows; ++i) { + eps(i) = knn_results[i].first(k); + k_hat(i) = knn_results[i].second.size(); + if (k == 1 && eps(i) == std::numeric_limits::infinity()) { + k_hat(i) = 1; + eps(i) = 0.0; + } + } + + // use the ztree to search in all Z, XZ and YZ subspaces + auto [n_xz, n_yz, n_z] = ztree.count_ball_subspaces(df, eps, is_discrete_column); + + double res = 0; + auto exclude_self = [](int value) { return (value > 1) ? (value - 1) : value; }; + +#pragma omp parallel for reduction(+ : res) + for (int i = 0; i < n_rows; ++i) { + res += boost::math::digamma(exclude_self(k_hat(i))) + boost::math::digamma(exclude_self(n_z(i))) - + boost::math::digamma(exclude_self(n_xz(i))) - boost::math::digamma(exclude_self(n_yz(i))); + } + + res /= n_rows; + + return res; +} + +double mi_pair(VPTree& ytree, + DataFrame& df, + int k, + std::shared_ptr datatype, + std::vector& is_discrete_column, + int tree_leafsize, + unsigned int seed) { + auto n_rows = df->num_rows(); + VPTree xytree(df, datatype, is_discrete_column, tree_leafsize, seed); + // excluding the reference point which is not a neighbor of itself + auto knn_results = xytree.query(df, k + 1); + + VectorXd eps(n_rows); + VectorXi k_hat(n_rows); + + for (auto i = 0; i < n_rows; ++i) { + eps(i) = knn_results[i].first[k]; + k_hat(i) = knn_results[i].second.size(); + if (k == 1 && eps(i) == std::numeric_limits::infinity()) { + k_hat(i) = 1; + eps(i) = 0.0; + } + } + + auto x_is_discrete_column = std::vector(is_discrete_column.begin(), is_discrete_column.begin() + 1); + auto y_is_discrete_column = std::vector(is_discrete_column.begin() + 1, is_discrete_column.end()); + + auto x_df = df.loc(0); + auto y_df = df.loc(1); + + VPTree xtree(x_df, datatype, x_is_discrete_column, tree_leafsize, seed); + + auto n_x = xtree.count_ball_unconditional(x_df, eps, x_is_discrete_column); + auto n_y = ytree.count_ball_unconditional(y_df, eps, y_is_discrete_column); + + double res = 0; + auto exclude_self = [](int value) { return (value > 1) ? 
(value - 1) : value; }; + +#pragma omp parallel for reduction(+ : res) + for (int i = 0; i < n_rows; ++i) { + // Z is treated as a constant column, thus n_z = n_rows - 1 + res += boost::math::digamma(exclude_self(k_hat(i))) + boost::math::digamma(n_rows - 1) - + boost::math::digamma(exclude_self(n_x(i))) - boost::math::digamma(exclude_self(n_y(i))); + } + + res /= n_rows; + + return res; +} + +int MixedKMutualInformation::find_minimum_cluster_size(const std::vector& discrete_vars) const { + auto dummy_vars = std::vector(discrete_vars.begin() + 1, discrete_vars.end()); + + auto [cardinality, strides] = factors::discrete::create_cardinality_strides(m_df, discrete_vars); + + auto joint_counts = factors::discrete::joint_counts(m_df, discrete_vars[0], dummy_vars, cardinality, strides); + + int min_cluster_size = std::numeric_limits::max(); + + // find minimum positive cluster size + for (int i = 0; i < joint_counts.size(); ++i) { + if (joint_counts[i] > 1 && joint_counts[i] < min_cluster_size) { + min_cluster_size = joint_counts[i]; + } + } + + return min_cluster_size; +} + +int MixedKMutualInformation::find_minimum_shuffled_cluster_size(const DataFrame& shuffled_df, + const std::vector& discrete_vars) const { + // hash the columns as they are no longer of type arrow::DictionaryArray + std::unordered_map joint_counts; + switch (m_datatype->id()) { + case Type::FLOAT: { + auto data = shuffled_df.downcast_vector(discrete_vars); + auto hashed_cols = hash_columns(data, discrete_vars, true); + for (long unsigned int i = 0; i < hashed_cols.size(); ++i) { + joint_counts[hashed_cols[i]]++; + } + break; + } + default: { + auto data = shuffled_df.downcast_vector(discrete_vars); + auto hashed_cols = hash_columns(data, discrete_vars, true); + for (long unsigned int i = 0; i < hashed_cols.size(); ++i) { + joint_counts[hashed_cols[i]]++; + } + } + } + int min_cluster_size = std::numeric_limits::max(); + + // find minimum positive cluster size + for (const auto& [config, count] : joint_counts) { + if (count > 1 && count < min_cluster_size) { + min_cluster_size = count; + } + } + + return min_cluster_size; +} + +std::vector check_discrete_cols(const DataFrame& df, + std::vector& is_discrete_column, + bool& discrete_present, + const std::string& x, + const std::string& y) { + is_discrete_column.push_back(df.is_discrete(x)); + is_discrete_column.push_back(df.is_discrete(y)); + + std::vector discrete_vars; + if (is_discrete_column[0]) { + discrete_vars.push_back(x); + discrete_present = true; + } + if (is_discrete_column[1]) { + discrete_vars.push_back(y); + discrete_present = true; + } + return discrete_vars; +} + +std::vector check_discrete_cols(const DataFrame& df, + std::vector& is_discrete_column, + bool& discrete_present, + const std::string& x, + const std::string& y, + const std::string& z) { + auto discrete_vars = check_discrete_cols(df, is_discrete_column, discrete_present, x, y); + is_discrete_column.push_back(df.is_discrete(z)); + + if (is_discrete_column.back()) { + discrete_vars.push_back(z); + discrete_present = true; + } + + return discrete_vars; +} + +std::vector check_discrete_cols(const DataFrame& df, + std::vector& is_discrete_column, + bool& discrete_present, + const std::string& x, + const std::string& y, + const std::vector& z) { + auto discrete_vars = check_discrete_cols(df, is_discrete_column, discrete_present, x, y); + for (const auto& col : z) { + is_discrete_column.push_back(df.is_discrete(col)); + if (is_discrete_column.back()) { + discrete_vars.push_back(col); + discrete_present = 
true; + } + } + + return discrete_vars; +} + +double MixedKMutualInformation::mi(const std::string& x, const std::string& y) const { + auto subset_df = m_scaled_df.loc(x, y); + std::vector is_discrete_column; + bool discrete_present = false; + int k = m_k; + auto discrete_vars = check_discrete_cols(m_df, is_discrete_column, discrete_present, x, y); + + if (discrete_present && m_adaptive_k) { + auto min_cluster_size = find_minimum_cluster_size(discrete_vars); + k = std::min(k, min_cluster_size - 1); + } + + auto y_is_discrete_column = std::vector(is_discrete_column.begin() + 1, is_discrete_column.end()); + auto y_df = subset_df.loc(1); + VPTree ytree(y_df, m_datatype, y_is_discrete_column, m_tree_leafsize, m_seed); + + return mi_pair(ytree, subset_df, k, m_datatype, is_discrete_column, m_tree_leafsize, m_seed); +} + +double MixedKMutualInformation::mi(const std::string& x, const std::string& y, const std::string& z) const { + auto subset_df = m_scaled_df.loc(x, y, z); + std::vector is_discrete_column; + bool discrete_present = false; + int k = m_k; + auto discrete_vars = check_discrete_cols(m_df, is_discrete_column, discrete_present, x, y, z); + + if (discrete_present && m_adaptive_k) { + auto min_cluster_size = find_minimum_cluster_size(discrete_vars); + k = std::min(k, min_cluster_size - 1); + } + + auto z_is_discrete_column = std::vector(is_discrete_column.begin() + 2, is_discrete_column.end()); + auto z_df = subset_df.loc(2); + VPTree ztree(z_df, m_datatype, z_is_discrete_column, m_tree_leafsize, m_seed); + + return mi_general(ztree, subset_df, k, m_datatype, is_discrete_column, m_tree_leafsize, m_seed); +} + +double MixedKMutualInformation::mi(const std::string& x, + const std::string& y, + const std::vector& z) const { + auto subset_df = m_scaled_df.loc(x, y, z); + std::vector is_discrete_column; + bool discrete_present = false; + int k = m_k; + auto discrete_vars = check_discrete_cols(m_df, is_discrete_column, discrete_present, x, y, z); + + if (discrete_present && m_adaptive_k) { + auto min_cluster_size = find_minimum_cluster_size(discrete_vars); + k = std::min(k, min_cluster_size - 1); + } + + auto z_df = m_scaled_df.loc(z); + auto z_is_discrete_column = std::vector(is_discrete_column.begin() + 2, is_discrete_column.end()); + + VPTree ztree(z_df, m_datatype, z_is_discrete_column, m_tree_leafsize, m_seed); + return mi_general(ztree, subset_df, k, m_datatype, is_discrete_column, m_tree_leafsize, m_seed); +} + +double compute_mean(const std::vector& data) { + return std::accumulate(data.begin(), data.end(), 0.0) / data.size(); +} + +double compute_variance(const std::vector& data, double mean) { + double variance = 0.0; + for (double x : data) { + variance += std::pow((x - mean), 2); + } + return variance / data.size(); +} + +double compute_skewness(const std::vector& data, double mean, double variance) { + double skewness = 0.0; + + for (double x : data) { + skewness += std::pow(x - mean, 3); + } + + return (skewness / data.size()) / std::pow(variance, 1.5); +} +double compute_pvalue(double original_mi, std::vector& permutation_stats, bool gamma_approx) { + double min_value = *std::min_element(permutation_stats.begin(), permutation_stats.end()); + double max_value = *std::max_element(permutation_stats.begin(), permutation_stats.end()); + + if (original_mi > max_value) { + return 1.0 / static_cast((permutation_stats.size() + 1)); + } else if (original_mi <= min_value) { + return 1.0; + } + + if (gamma_approx) { + // include unpermuted statistic for conservative p-value estimation + 
permutation_stats.push_back(original_mi); + + double mean = compute_mean(permutation_stats); + double variance = compute_variance(permutation_stats, mean); + double skewness = compute_skewness(permutation_stats, mean, variance); + + // standardise to mu=0 std=1 to fit a Pearson type III PDF (Minas & Montana, 2014) + for (long unsigned int i = 0; i < permutation_stats.size(); ++i) { + permutation_stats[i] = (permutation_stats[i] - mean) / std::sqrt(variance); + } + + auto z_value = permutation_stats.back(); + permutation_stats.pop_back(); + + if (skewness == 0.0) { + // Standard normal distribution + boost::math::normal_distribution<> standard_normal(0.0, 1.0); + return boost::math::cdf(boost::math::complement(standard_normal, z_value)); + } + double k, theta, c; + k = 4 / std::pow(skewness, 2); // shape + theta = skewness / 2.0; // scale + c = -2.0 / skewness; // location shift + + auto x_value = (z_value - c) / theta; + + // fit gamma using method of moments to compute the p-value + + if (skewness > 0) { + if (x_value >= util::machine_tol) { // practically 0, but avoids convergence timeouts + return boost::math::gamma_q<>(k, x_value); // upper tail + } + + return 1.0; // outside gamma support + } + + else if (x_value >= util::machine_tol) { + return boost::math::gamma_p<>(k, x_value); // lower tail + } + + return 1.0 / static_cast((permutation_stats.size() + 1)); // outside gamma support + } + + // crude Monte Carlo p-value computation + int count_greater = 1; + + for (long unsigned int i = 0; i < permutation_stats.size(); ++i) { + if (permutation_stats[i] >= original_mi) ++count_greater; + } + + return static_cast(count_greater) / static_cast((permutation_stats.size() + 1)); +} + +double MixedKMutualInformation::pvalue(const std::string& x, const std::string& y) const { + std::mt19937 rng{m_seed}; + std::vector is_discrete_column; + bool discrete_present = false; + int k = m_k; + auto discrete_vars = check_discrete_cols(m_df, is_discrete_column, discrete_present, x, y); + + // the adaptive k affects both the CMI estimates and the shuffling + if (discrete_present && m_adaptive_k) { + auto min_cluster_size = find_minimum_cluster_size(discrete_vars); + k = std::min(k, min_cluster_size - 1); + } + + auto y_is_discrete_column = std::vector(is_discrete_column.begin() + 1, is_discrete_column.end()); + auto shuffled_df = m_scaled_df.loc(Copy(x), y); + auto y_df = shuffled_df.loc(1); + + // reuse the ytree as the Y column will not be shuffled + VPTree ytree(y_df, m_datatype, y_is_discrete_column, m_tree_leafsize, m_seed); + + auto original_mi = mi_pair(ytree, shuffled_df, k, m_datatype, is_discrete_column, m_tree_leafsize, m_seed); + std::vector permutation_stats(m_samples); + + switch (m_datatype->id()) { + case Type::FLOAT: { + auto x_begin = shuffled_df.template mutable_data(0); + auto x_end = x_begin + shuffled_df->num_rows(); + + for (int i = 0; i < m_samples; ++i) { + std::shuffle(x_begin, x_end, rng); + // we compute the adaptive k only if X is discrete + if (is_discrete_column[0] && m_adaptive_k) { + auto min_cluster_size = find_minimum_shuffled_cluster_size(shuffled_df, discrete_vars); + k = std::min(k, min_cluster_size - 1); + } + auto shuffled_value = + mi_pair(ytree, shuffled_df, k, m_datatype, is_discrete_column, m_tree_leafsize, m_seed); + permutation_stats[i] = shuffled_value; + } + break; + } + + default: { + auto x_begin = shuffled_df.template mutable_data(0); + auto x_end = x_begin + shuffled_df->num_rows(); + + for (int i = 0; i < m_samples; ++i) { + std::shuffle(x_begin, 
x_end, rng); + // we compute the adaptive k only if X is discrete + if (is_discrete_column[0] && m_adaptive_k) { + auto min_cluster_size = find_minimum_shuffled_cluster_size(shuffled_df, discrete_vars); + k = std::min(k, min_cluster_size - 1); + } + auto shuffled_value = + mi_pair(ytree, shuffled_df, k, m_datatype, is_discrete_column, m_tree_leafsize, m_seed); + permutation_stats[i] = shuffled_value; + } + } + } + + return compute_pvalue(original_mi, permutation_stats, m_gamma_approx); +} + +double MixedKMutualInformation::pvalue(const std::string& x, const std::string& y, const std::string& z) const { + auto subset_df = m_scaled_df.loc(x, y, z); + std::vector is_discrete_column; + bool discrete_present = false; + int k = m_k; + int shuffle_neighbors = m_shuffle_neighbors; + auto discrete_vars = check_discrete_cols(m_df, is_discrete_column, discrete_present, x, y, z); + + if (discrete_present && m_adaptive_k) { + auto min_cluster_size = find_minimum_cluster_size(discrete_vars); + k = std::min(k, min_cluster_size - 1); + shuffle_neighbors = std::min(shuffle_neighbors, min_cluster_size - 1); + } + + auto x_df = subset_df.loc(0); + + auto z_is_discrete_column = std::vector(is_discrete_column.begin() + 2, is_discrete_column.end()); + auto shuffled_df = m_scaled_df.loc(Copy(x), y, z); + auto z_df = shuffled_df.loc(2); + + // reuse the ztree as the Z column will not be shuffled + VPTree ztree(z_df, m_datatype, z_is_discrete_column, m_tree_leafsize, m_seed); + + auto original_mi = mi_general(ztree, shuffled_df, k, m_datatype, is_discrete_column, m_tree_leafsize, m_seed); + + return shuffled_pvalue( + original_mi, k, shuffle_neighbors, x_df, ztree, z_df, shuffled_df, is_discrete_column, discrete_vars); +} + +double MixedKMutualInformation::pvalue(const std::string& x, + const std::string& y, + const std::vector& z) const { + auto subset_df = m_scaled_df.loc(x, y, z); + std::vector is_discrete_column; + bool discrete_present = false; + int k = m_k; + int shuffle_neighbors = m_shuffle_neighbors; + auto discrete_vars = check_discrete_cols(m_df, is_discrete_column, discrete_present, x, y, z); + + // the adaptive k affects both the CMI estimates and the shuffling + if (discrete_present && m_adaptive_k) { + auto min_cluster_size = find_minimum_cluster_size(discrete_vars); + k = std::min(k, min_cluster_size - 1); + shuffle_neighbors = std::min(shuffle_neighbors, min_cluster_size - 1); + } + + auto x_df = subset_df.loc(0); + + auto z_is_discrete_column = std::vector(is_discrete_column.begin() + 2, is_discrete_column.end()); + auto shuffled_df = m_scaled_df.loc(Copy(x), y, z); + auto z_df = shuffled_df.loc(z); + + // reuse the ztree as the Z column will not be shuffled + VPTree ztree(z_df, m_datatype, z_is_discrete_column, m_tree_leafsize, m_seed); + + auto original_mi = mi_general(ztree, shuffled_df, k, m_datatype, is_discrete_column, m_tree_leafsize, m_seed); + + return shuffled_pvalue( + original_mi, k, shuffle_neighbors, x_df, ztree, z_df, shuffled_df, is_discrete_column, discrete_vars); +} + +/* tries to perform shuffling without replacement */ +template +void shuffle_dataframe(const CType* original_x, + CType* shuffled_x, + const std::vector& order, + std::vector& used, + std::vector& neighbors, + Random& rng) { + // first shuffle the neighbors found in the Z subspace + for (auto& neighbor_list : neighbors) { + auto begin = neighbor_list.data(); + auto end = begin + neighbor_list.size(); + std::shuffle(begin, end, rng); + } + + // using the random order, replace instance with the first unused 
shuffled neighbor + for (long unsigned int i = 0; i < order.size(); ++i) { + size_t index = order[i]; + int neighbor_index = 0; + long int j = 0; + + for (; j < neighbors[index].size(); ++j) { + neighbor_index = neighbors[index][j]; + if (!used[neighbor_index]) { + break; + } + } + + // if there were collisions, keep instance with original value + if (j == neighbors[index].size()) neighbor_index = index; + + shuffled_x[index] = original_x[neighbor_index]; + used[neighbor_index] = true; + } +} + +double MixedKMutualInformation::shuffled_pvalue(double original_mi, + int k, + int shuffle_neighbors, + DataFrame& x_df, + VPTree& ztree, + DataFrame& z_df, + DataFrame& shuffled_df, + std::vector& is_discrete_column, + std::vector& discrete_vars) const { + std::minstd_rand rng{m_seed}; + std::vector neighbors(m_df->num_rows()); + + auto zknn = ztree.query(z_df, shuffle_neighbors); + + for (size_t i = 0; i < zknn.size(); ++i) { + neighbors[i] = zknn[i].second; + } + + std::vector order(m_df->num_rows()); + std::iota(order.begin(), order.end(), 0); + + std::vector used(m_df->num_rows(), false); + std::vector permutation_stats(m_samples); + + switch (m_datatype->id()) { + case Type::FLOAT: { + auto original_x = x_df.template data(0); + auto shuffled_x = shuffled_df.template mutable_data(0); + + for (int i = 0; i < m_samples; ++i) { + std::shuffle(order.begin(), order.end(), rng); + shuffle_dataframe(original_x, shuffled_x, order, used, neighbors, rng); + // we recompute the adaptive k only if X is discrete + if (is_discrete_column[0] && m_adaptive_k) { + auto min_cluster_size = find_minimum_shuffled_cluster_size(shuffled_df, discrete_vars); + k = std::min(k, min_cluster_size - 1); + } + + auto shuffled_value = + mi_general(ztree, shuffled_df, k, m_datatype, is_discrete_column, m_tree_leafsize, m_seed); + + permutation_stats[i] = shuffled_value; + + std::fill(used.begin(), used.end(), false); + } + break; + } + + default: { + auto original_x = x_df.template data(0); + auto shuffled_x = shuffled_df.template mutable_data(0); + + for (int i = 0; i < m_samples; ++i) { + std::shuffle(order.begin(), order.end(), rng); + shuffle_dataframe(original_x, shuffled_x, order, used, neighbors, rng); + // we recompute the adaptive k only if X is discrete + if (is_discrete_column[0] && m_adaptive_k) { + auto min_cluster_size = find_minimum_shuffled_cluster_size(shuffled_df, discrete_vars); + k = std::min(k, min_cluster_size - 1); + } + + auto shuffled_value = + mi_general(ztree, shuffled_df, k, m_datatype, is_discrete_column, m_tree_leafsize, m_seed); + + permutation_stats[i] = shuffled_value; + + std::fill(used.begin(), used.end(), false); + } + } + } + + return compute_pvalue(original_mi, permutation_stats, m_gamma_approx); +} + +} // namespace learning::independences::hybrid \ No newline at end of file diff --git a/pybnesian/learning/independences/hybrid/mixed_knncmi.hpp b/pybnesian/learning/independences/hybrid/mixed_knncmi.hpp new file mode 100644 index 00000000..10346ea1 --- /dev/null +++ b/pybnesian/learning/independences/hybrid/mixed_knncmi.hpp @@ -0,0 +1,102 @@ +#ifndef PYBNESIAN_LEARNING_INDEPENDENCES_HYBRID_MS_MUTUAL_INFORMATION_HPP +#define PYBNESIAN_LEARNING_INDEPENDENCES_HYBRID_MS_MUTUAL_INFORMATION_HPP + +#include +#include +#include +#include + +using dataset::DataFrame, dataset::Copy; +using Eigen::MatrixXi; +using Array_ptr = std::shared_ptr; +using vptree::VPTree; + +namespace learning::independences::hybrid { +DataFrame scale_data(const DataFrame& df, const std::string& scaling); + +double 
mi_general(VPTree& ztree, + DataFrame& df, + int k, + std::shared_ptr datatype, + std::vector& is_discrete_column, + int tree_leafsize, + unsigned int seed); +double mi_pair(VPTree& ytree, + DataFrame& df, + int k, + std::shared_ptr datatype, + std::vector& is_discrete_column, + int tree_leafsize, + unsigned int seed); + +class MixedKMutualInformation : public IndependenceTest { +public: + MixedKMutualInformation(DataFrame df, + int k, + unsigned int seed = std::random_device{}(), + int shuffle_neighbors = 5, + int samples = 1000, + std::string scaling = "min_max", + bool gamma_approx = true, + bool adaptive_k = true, + int tree_leafsize = 16) + : m_df(df), + m_scaled_df(scale_data(df, scaling)), + m_datatype(), + m_k(k), + m_seed(seed), + m_shuffle_neighbors(shuffle_neighbors), + m_samples(samples), + m_gamma_approx(gamma_approx), + m_adaptive_k(adaptive_k), + m_tree_leafsize(tree_leafsize) { + m_datatype = m_scaled_df.same_type(); + } + + double pvalue(const std::string& x, const std::string& y) const override; + double pvalue(const std::string& x, const std::string& y, const std::string& z) const override; + double pvalue(const std::string& x, const std::string& y, const std::vector& z) const override; + + double mi(const std::string& x, const std::string& y) const; + double mi(const std::string& x, const std::string& y, const std::string& z) const; + double mi(const std::string& x, const std::string& y, const std::vector& z) const; + + int num_variables() const override { return m_df->num_columns(); } + + std::vector variable_names() const override { return m_df.column_names(); } + + const std::string& name(int i) const override { return m_df.name(i); } + + bool has_variables(const std::string& name) const override { return m_df.has_columns(name); } + + bool has_variables(const std::vector& cols) const override { return m_df.has_columns(cols); } + +private: + double shuffled_pvalue(double original_mi, + int k, + int shuffle_neighbors, + DataFrame& x_df, + VPTree& ztree, + DataFrame& z_df, + DataFrame& shuffled_df, + std::vector& is_discrete_column, + std::vector& discrete_vars) const; + + int find_minimum_cluster_size(const std::vector& discrete_vars) const; + int find_minimum_shuffled_cluster_size(const DataFrame& shuffled_df, + const std::vector& discrete_vars) const; + DataFrame m_df; + DataFrame m_scaled_df; + std::shared_ptr m_datatype; + int m_k; + unsigned int m_seed; + int m_shuffle_neighbors; + int m_samples; + bool m_gamma_approx; + bool m_adaptive_k; + int m_tree_leafsize; +}; + +} // namespace learning::independences::hybrid + +#endif // PYBNESIAN_LEARNING_INDEPENDENCES_HYBRID_MS_MUTUAL_INFORMATION_HPP \ No newline at end of file diff --git a/pybnesian/learning/independences/hybrid/mutual_information.cpp b/pybnesian/learning/independences/hybrid/mutual_information.cpp index 4e5d5c87..029da9e9 100644 --- a/pybnesian/learning/independences/hybrid/mutual_information.cpp +++ b/pybnesian/learning/independences/hybrid/mutual_information.cpp @@ -920,6 +920,7 @@ std::pair conditional_covariance( double entropy_mvn(int dimensionality, double cov_det) { auto d = static_cast(dimensionality); + return 0.5 * d + 0.5 * d * std::log(2 * util::pi) + 0.5 * std::log(cov_det); } @@ -951,7 +952,7 @@ double MutualInformation::mi_discrete(const std::string& x, const std::string& y } } - return mi; + return std::max(mi, util::machine_tol); } template @@ -1020,6 +1021,7 @@ double MutualInformation::mi_mixed_impl(const std::string& discrete, const std:: // Add H(Y_C) double mi = 0.5 + 0.5 * 
std::log(2 * util::pi * total_variance); + for (auto j = 0; j < num_categories; ++j) { if (counts(j) > 0) { auto pj = static_cast(counts(j)) / total_counts; @@ -1029,7 +1031,7 @@ double MutualInformation::mi_mixed_impl(const std::string& discrete, const std:: } } - return std::max(mi, 0.); + return std::max(mi, util::machine_tol); } double MutualInformation::mi_mixed(const std::string& discrete, const std::string& continuous) const { @@ -1056,9 +1058,10 @@ template double MutualInformation::mi_continuous_impl(const std::string& x, const std::string& y) const { auto pcov = m_df.cov(x, y); auto& cov = *pcov; - auto cor = cov(0, 1) / sqrt(cov(0, 0) * cov(1, 1)); - return -0.5 * std::log(1 - cor * cor); + + auto mi = -0.5 * std::log(1 - cor * cor); + return std::max(mi, util::machine_tol); } double MutualInformation::mi_continuous(const std::string& x, const std::string& y) const { @@ -1124,8 +1127,14 @@ double MutualInformation::calculate_df(const std::string& x, const std::string& double MutualInformation::pvalue(const std::string& x, const std::string& y) const { auto mi_value = mi(x, y); + // Multiply by 2*N to obtain 2*N*MI(X; Y). This follows a X^2 distribution. mi_value *= 2 * m_df.valid_rows(x, y); + + if (std::isinf(mi_value) || std::isnan(mi_value)) { + return 1; + } + auto df = calculate_df(x, y); boost::math::chi_squared_distribution chidist(static_cast(df)); @@ -1250,7 +1259,8 @@ double MutualInformation::cmi_discrete_continuous_impl(const std::string& x, double pi = static_cast(ni) / total_counts; double pj = static_cast(nj) / total_counts; - auto h_xy = 0.5 + 0.5 * std::log(2 * util::pi * variance_xy(k)); + double h_xy = 0.5 + 0.5 * std::log(2 * util::pi * variance_xy(k)); + mi += pij * (-h_xy + std::log(pij / (pi * pj))); } } @@ -1276,7 +1286,7 @@ double MutualInformation::cmi_discrete_continuous_impl(const std::string& x, // Sum - H(Z) mi -= 0.5 + 0.5 * std::log(2 * util::pi * total_variance); - return std::max(mi, 0.); + return std::max(mi, util::machine_tol); } double MutualInformation::cmi_discrete_continuous(const std::string& x, @@ -1378,6 +1388,11 @@ double MutualInformation::pvalue(const std::string& x, const std::string& y, con auto mi_value = mi(x, y, z); // Multiply by 2*N to obtain 2*N*MI(X; Y). This follows a X^2 distribution. mi_value *= 2 * m_df.valid_rows(x, y, z); + + if (std::isinf(mi_value) || std::isnan(mi_value)) { + return 1; + } + auto df = calculate_df(x, y, z); boost::math::chi_squared_distribution chidist(static_cast(df)); @@ -1442,7 +1457,7 @@ double MutualInformation::cmi_discrete_discrete(const std::string& x, } // mi contains N*MI(X; Y). 
- return mi; + return std::max(mi, util::machine_tol); } double MutualInformation::cmi_general_both_discrete(const std::string& x, @@ -1527,7 +1542,7 @@ double MutualInformation::cmi_general_both_discrete(const std::string& x, mi -= pz * h_z; } - return std::max(mi, 0.); + return std::max(mi, util::machine_tol); } double MutualInformation::cmi_general_mixed(const std::string& x_discrete, @@ -1584,7 +1599,7 @@ double MutualInformation::cmi_general_mixed(const std::string& x_discrete, } } - return std::max(mi, 0.); + return std::max(mi, util::machine_tol); } double MutualInformation::cmi_general_both_continuous(const std::string& x, @@ -1621,7 +1636,7 @@ double MutualInformation::cmi_general_both_continuous(const std::string& x, } } - return std::max(mi, 0.); + return std::max(mi, util::machine_tol); } double MutualInformation::cmi_general(const std::string& x, @@ -1744,6 +1759,11 @@ double MutualInformation::pvalue(const std::string& x, const std::string& y, con auto mi_value = cmi_general(x, y, discrete_z, continuous_z); // Multiply by 2*N to obtain 2*N*MI(X; Y). This follows a X^2 distribution. mi_value *= 2 * m_df.valid_rows(x, y, z); + + if (std::isinf(mi_value) || std::isnan(mi_value)) { + return 1; + } + auto df = calculate_df(x, y, discrete_z, continuous_z); boost::math::chi_squared_distribution chidist(static_cast(df)); diff --git a/pybnesian/learning/operators/operators.cpp b/pybnesian/learning/operators/operators.cpp index 826c512e..f97f5fea 100644 --- a/pybnesian/learning/operators/operators.cpp +++ b/pybnesian/learning/operators/operators.cpp @@ -16,6 +16,12 @@ std::shared_ptr AddArc::opposite(const ConditionalBayesianNetworkBase& return opposite(static_cast(m)); } +/** + * @brief Updates the valid operations matrix and the delta matrix. + * The idea is that arc_whitelist and arc_blacklist are operations that have to be ignored. + * + * @param model BayesianNetwork. + */ void ArcOperatorSet::update_valid_ops(const BayesianNetworkBase& model) { int num_nodes = model.num_nodes(); @@ -96,6 +102,12 @@ double cache_score_operation(const BayesianNetworkBase& model, return d; } } +/** + * @brief Cache scores for the given BayesianNetwork and ArcOperator score. + * + * @param model BayesianNetwork. + * @param score Score. + */ void ArcOperatorSet::cache_scores(const BayesianNetworkBase& model, const Score& score) { if (!score.compatible_bn(model)) { @@ -108,16 +120,18 @@ void ArcOperatorSet::cache_scores(const BayesianNetworkBase& model, const Score& this->m_local_cache->cache_local_scores(model, score); } - update_valid_ops(model); + update_valid_ops(model); // Updates a matrix of valid operations and a matrix of delta scores. auto bn_type = model.type(); - for (const auto& target_node : model.nodes()) { + for (const auto& target_node : model.nodes()) { // Iterates over all target_node in the model. std::vector new_parents_target = model.parents(target_node); int target_collapsed = model.collapsed_index(target_node); - for (const auto& source_node : model.nodes()) { + for (const auto& source_node : model.nodes()) { // Iterates over all source_node in the model. int source_collapsed = model.collapsed_index(source_node); if (valid_op(source_collapsed, target_collapsed) && - bn_type->can_have_arc(model, source_node, target_node)) { + bn_type->can_have_arc( + model, source_node, target_node)) { // If the arc operation (source_node, target_node) is valid. + // NOTE: FIXED Here the score is calculated and may fail if the covariance matrix is singular. 
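+                // cache_score_operation caches the local-score difference of the
+                // candidate operation on the arc source_node -> target_node, e.g.
+                // for an arc addition:
+                //   delta(source, target) = s(target | parents U {source})
+                //                         - s(target | parents)
+                // where s(.) is the cached local score.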
delta(source_collapsed, target_collapsed) = cache_score_operation(model, score, @@ -209,7 +223,13 @@ void ArcOperatorSet::update_valid_ops(const ConditionalBayesianNetworkBase& mode } } } - +/** + * @brief Cache scores for the given ConditionalBayesianNetwork and ArcOperator score. + * + * @param model BayesianNetwork. + * @param score Score. + */ +// TODO: Update ConditionalBayesianNetworkBase for singular covariance? void ArcOperatorSet::cache_scores(const ConditionalBayesianNetworkBase& model, const Score& score) { if (!score.compatible_bn(model)) { throw std::invalid_argument("BayesianNetwork is not compatible with the score."); @@ -292,53 +312,83 @@ std::shared_ptr ArcOperatorSet::find_max(const ConditionalBayesianNetw else return find_max_indegree(model, tabu_set); } - +/** + * @brief Find the maximum operation for the given BayesianNetwork and ArcOperatorSet score. + * + * @param model + * @param score + * @param target_node + */ void ArcOperatorSet::update_incoming_arcs_scores(const BayesianNetworkBase& model, const Score& score, const std::string& target_node) { auto target_collapsed = model.collapsed_index(target_node); - auto parents = model.parents(target_node); + auto parents = model.parents(target_node); // The parents of the target_node auto bn_type = model.type(); for (const auto& source_node : model.nodes()) { auto source_collapsed = model.collapsed_index(source_node); if (valid_op(source_collapsed, target_collapsed)) { - if (model.has_arc(source_node, target_node)) { - // Update remove arc: source_node -> target_node - util::swap_remove_v(parents, source_node); - double d = score.local_score(model, target_node, parents) - - this->m_local_cache->local_score(model, target_node); - parents.push_back(source_node); - delta(source_collapsed, target_collapsed) = d; - - // Update flip arc: source_node -> target_node + // ARC FLIPPING source_node -> target_node to target_node -> source_node: + if (model.has_arc(source_node, + target_node)) { // If the arc source_node -> target_node already exists, remove it and + // then put the reverse arc if possible. 
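+                // With S = source_node, T = target_node and s(.) the local score,
+                // this branch caches:
+                //   removal: d  = s(T | Pa(T) \ {S}) - s(T | Pa(T))
+                //   flip:    d2 = d + s(S | Pa(S) U {T}) - s(S | Pa(S))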
+ util::swap_remove_v(parents, source_node); // Remove source_node from the parents of target_node + // score of removing (source_collapsed -> target_node) + double d = score.local_score(model, target_node, parents) - // New score with the removed arc + this->m_local_cache->local_score(model, target_node); // Old score with the arc + parents.push_back(source_node); // Readd source_node to the parents of target_node + delta(source_collapsed, target_collapsed) = d; // score of removing (source_collapsed -> target_node) + + // Update flip arc: source_node -> target_node to target_node -> source_node if (valid_op(target_collapsed, source_collapsed) && - bn_type->can_have_arc(model, target_node, source_node)) { + bn_type->can_have_arc( + model, target_node, source_node)) { // If the reverse arc (target_node -> source_node) is + // possible, then put the reverse arc auto parents_source = model.parents(source_node); parents_source.push_back(target_node); - delta(target_collapsed, source_collapsed) = d + - score.local_score(model, source_node, parents_source) - - this->m_local_cache->local_score(model, source_node); + double d2; + // score of adding (target_node -> source_collapsed) + d2 = d + score.local_score(model, source_node, parents_source) - // New score with the added arc + this->m_local_cache->local_score(model, source_node); // Old score without the arc + delta(target_collapsed, source_collapsed) = + d2; // score of reversing (source_collapsed -> target_node) to (target_node -> + // source_collapsed) } } else if (model.has_arc(target_node, source_node) && - bn_type->can_have_arc(model, source_node, target_node)) { - // Update flip arc: target_node -> source_node + bn_type->can_have_arc( + model, + source_node, + target_node)) { // ARC FLIPPING target_node -> source_node to source_node -> target_node: + // If the arc target_node -> source_node already exists and the reverse arc + // is possible, then put the flip the arc to source_node -> target_node. auto parents_source = model.parents(source_node); - util::swap_remove_v(parents_source, target_node); + util::swap_remove_v(parents_source, target_node); // Remove target_node from the parents of source_node parents.push_back(source_node); - double d = score.local_score(model, source_node, parents_source) + - score.local_score(model, target_node, parents) - - this->m_local_cache->local_score(model, source_node) - - this->m_local_cache->local_score(model, target_node); + + // Update flip arc score: target_node -> source_node to source_node -> target_node + double d; + d = score.local_score(model, + target_node, + parents) + // New score after adding source_node as parent of target_node + score.local_score( + model, + source_node, + parents_source) - // New score after removing target_node as parent of source_node + this->m_local_cache->local_score(model, target_node) - + this->m_local_cache->local_score(model, source_node); + parents.pop_back(); + // TODO: Is necessary parents_source.push_back(target_node);? 
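+                // Restoring parents_source is not needed: it is a fresh copy taken
+                // from model.parents(source_node) inside this branch, whereas
+                // `parents` is reused across iterations and must be popped back.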
delta(source_collapsed, target_collapsed) = d; } else if (bn_type->can_have_arc(model, source_node, target_node)) { // Update add arc: source_node -> target_node parents.push_back(source_node); - double d = score.local_score(model, target_node, parents) - - this->m_local_cache->local_score(model, target_node); + double d; + d = score.local_score(model, target_node, parents) - + this->m_local_cache->local_score(model, target_node); parents.pop_back(); delta(source_collapsed, target_collapsed) = d; } @@ -366,7 +416,7 @@ void ArcOperatorSet::update_incoming_arcs_scores(const ConditionalBayesianNetwor const Score& score, const std::string& target_node) { auto target_collapsed = model.collapsed_index(target_node); - auto parents = model.parents(target_node); + auto parents = model.parents(target_node); // The parents of the target_node auto bn_type = model.type(); for (const auto& source_node : model.joint_nodes()) { @@ -435,7 +485,12 @@ void ArcOperatorSet::update_scores(const ConditionalBayesianNetworkBase& model, update_incoming_arcs_scores(model, score, n); } } - +/** + * @brief Cache scores for the given BayesianNetwork and ChangeNodeTypeSet score. + * + * @param model BayesianNetwork. + * @param score Score. + */ void ChangeNodeTypeSet::cache_scores(const BayesianNetworkBase& model, const Score& score) { if (model.type_ref().is_homogeneous()) { throw std::invalid_argument("ChangeNodeTypeSet can only be used with non-homogeneous Bayesian networks."); diff --git a/pybnesian/learning/operators/operators.hpp b/pybnesian/learning/operators/operators.hpp index a624c830..025d4ddc 100644 --- a/pybnesian/learning/operators/operators.hpp +++ b/pybnesian/learning/operators/operators.hpp @@ -21,7 +21,7 @@ namespace learning::operators { class Operator { public: Operator(double delta) : m_delta(delta) {} - virtual ~Operator(){}; + virtual ~Operator() {}; virtual bool is_python_derived() const { return false; } @@ -292,26 +292,43 @@ class OperatorTabuSet { SetType m_set; }; +/** + * @brief Cache of local scores for each node in the network. + * + */ class LocalScoreCache { public: LocalScoreCache() : m_local_score() {} LocalScoreCache(const BayesianNetworkBase& m) : m_local_score(m.num_nodes()) {} + /** + * @brief Cache local scores for each node in the network. + * + * @param model Bayesian network + * @param score Score + */ void cache_local_scores(const BayesianNetworkBase& model, const Score& score) { + // Checks if the cache has the right size if (m_local_score.rows() != model.num_nodes()) { m_local_score = VectorXd(model.num_nodes()); } - + // Caches the local score for each node for (const auto& node : model.nodes()) { m_local_score(model.collapsed_index(node)) = score.local_score(model, node); } } - + /** + * @brief Cache Validated local scores for each node in the network. + * + * @param model Bayesian network + * @param score Validated score + */ void cache_vlocal_scores(const BayesianNetworkBase& model, const ValidatedScore& score) { + // Checks if the cache has the right size if (m_local_score.rows() != model.num_nodes()) { m_local_score = VectorXd(model.num_nodes()); } - + // Caches the validated local score for each node for (const auto& node : model.nodes()) { m_local_score(model.collapsed_index(node)) = score.vlocal_score(model, node); } @@ -833,6 +850,13 @@ class OperatorPool : public OperatorSet { std::vector> m_op_sets; }; +/** + * @brief Cache local scores for each of the operators in the pool with the given model and score. 
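+ * A shared LocalScoreCache is initialized first, so the node-local scores are
+ * computed once and reused by every operator set in the pool.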
+ * + * @tparam M Model type + * @param model Bayesian network + * @param score Score + */ template void OperatorPool::cache_scores(const M& model, const Score& score) { if (!this->m_local_cache) { diff --git a/pybnesian/learning/parameters/mle_DiscreteFactor.cpp b/pybnesian/learning/parameters/mle_DiscreteFactor.cpp index 6b62c8f7..f7fa81bd 100644 --- a/pybnesian/learning/parameters/mle_DiscreteFactor.cpp +++ b/pybnesian/learning/parameters/mle_DiscreteFactor.cpp @@ -30,9 +30,12 @@ typename DiscreteFactor::ParamsClass _fit(const DataFrame& df, logprob(offset + i) = loguniform; } } else { - double logsum_configuration = std::log(static_cast(sum_configuration)); + // Schurmann-Grassberger smoothing, lambda = 1 (uniform prior) + double lambda = 1 / cardinality(0); + double logsum_configuration = std::log(static_cast(sum_configuration + lambda * cardinality(0))); for (auto i = 0; i < cardinality(0); ++i) { - logprob(offset + i) = std::log(static_cast(joint_counts(offset + i))) - logsum_configuration; + logprob(offset + i) = + std::log(static_cast(joint_counts(offset + i) + lambda)) - logsum_configuration; } } } diff --git a/pybnesian/learning/scores/cv_likelihood.cpp b/pybnesian/learning/scores/cv_likelihood.cpp index 19344f6c..b7086f3c 100644 --- a/pybnesian/learning/scores/cv_likelihood.cpp +++ b/pybnesian/learning/scores/cv_likelihood.cpp @@ -8,15 +8,27 @@ double CVLikelihood::local_score(const BayesianNetworkBase& model, return local_score(model, model.underlying_node_type(m_cv.data(), variable), variable, evidence); } +/** + * @brief Calculates the local score (cross-validated log-likelihood) of a variable given the evidence. + * NOTE: Requires fitting a cpd for each fold. + * @param model + * @param variable_type + * @param variable + * @param evidence + * @return double + */ double CVLikelihood::local_score(const BayesianNetworkBase& model, const std::shared_ptr& variable_type, const std::string& variable, const std::vector& evidence) const { auto [args, kwargs] = m_arguments.args(variable, variable_type); + // Initialize the CPD auto cpd = variable_type->new_factor(model, variable, evidence, args, kwargs); + // Calculates the log-likelihood for each fold double loglik = 0; for (auto [train_df, test_df] : m_cv.loc(variable, evidence)) { + // NOTE: This fit fails if the Covariance matrix is not positive definite cpd->fit(train_df); loglik += cpd->slogl(test_df); } diff --git a/pybnesian/learning/scores/cv_likelihood.hpp b/pybnesian/learning/scores/cv_likelihood.hpp index 6b481e73..87d6d043 100644 --- a/pybnesian/learning/scores/cv_likelihood.hpp +++ b/pybnesian/learning/scores/cv_likelihood.hpp @@ -11,6 +11,11 @@ using models::BayesianNetworkBase, models::BayesianNetworkType; namespace learning::scores { +/** + * @brief This class implements an estimation of the log-likelihood on unseen data using k-fold cross validation over + * the data. + * + */ class CVLikelihood : public Score { public: CVLikelihood(const DataFrame& df, diff --git a/pybnesian/learning/scores/holdout_likelihood.hpp b/pybnesian/learning/scores/holdout_likelihood.hpp index c7b9dfa6..8ddd1d8a 100644 --- a/pybnesian/learning/scores/holdout_likelihood.hpp +++ b/pybnesian/learning/scores/holdout_likelihood.hpp @@ -12,6 +12,11 @@ using models::GaussianNetwork, models::SemiparametricBN; namespace learning::scores { +/** + * @brief This class implements an estimation of the log-likelihood on unseen data using a holdout dataset. 
Thus, the + * parameters are estimated using training data, and the score is estimated in the holdout data. + * + */ class HoldoutLikelihood : public Score { public: HoldoutLikelihood(const DataFrame& df, diff --git a/pybnesian/learning/scores/validated_likelihood.hpp b/pybnesian/learning/scores/validated_likelihood.hpp index 4bb98d45..e7dd7456 100644 --- a/pybnesian/learning/scores/validated_likelihood.hpp +++ b/pybnesian/learning/scores/validated_likelihood.hpp @@ -8,7 +8,15 @@ using learning::scores::ValidatedScore, learning::scores::HoldoutLikelihood, learning::scores::CVLikelihood; namespace learning::scores { - +/** + * @brief This class mixes the functionality of CVLikelihood and HoldoutLikelihood. First, it applies a HoldOut split + over the data. Then: + - It estimates the training score using a CVLikelihood over the training data. + - It estimates the validation score using the training data to estimate the parameters and calculating the + log-likelihood on the holdout data. + + * + */ class ValidatedLikelihood : public ValidatedScore { public: ValidatedLikelihood(const DataFrame& df, @@ -26,7 +34,15 @@ class ValidatedLikelihood : public ValidatedScore { const std::vector& parents) const override { return m_cv.local_score(model, variable, parents); } - + /** + * @brief Calculates the cross-validated log-likelihood of a variable given its parents. + * + * @param model + * @param variable_type + * @param variable + * @param parents + * @return double + */ double local_score(const BayesianNetworkBase& model, const std::shared_ptr& variable_type, const std::string& variable, @@ -51,7 +67,15 @@ class ValidatedLikelihood : public ValidatedScore { const std::vector& evidence) const override { return m_holdout.local_score(model, variable, evidence); } - + /** + * @brief Calculates the validated local score of a variable given the evidence. + * + * @param model BayesianNetworkBase + * @param variable_type FactorType + * @param variable the variable name + * @param evidence the evidence vector + * @return double the validated local score + */ double vlocal_score(const BayesianNetworkBase& model, const std::shared_ptr& variable_type, const std::string& variable, diff --git a/pybnesian/models/BayesianNetwork.hpp b/pybnesian/models/BayesianNetwork.hpp index 830df3b6..8f9b492e 100644 --- a/pybnesian/models/BayesianNetwork.hpp +++ b/pybnesian/models/BayesianNetwork.hpp @@ -62,6 +62,12 @@ class BayesianNetworkBase : public std::enable_shared_from_thiscan_have_arc(*this, target, source); } - + /** + * @brief Include the given whitelisted arcs. It checks the validity of the graph after including the arc whitelist. + * + * @param arc_whitelist List of arcs to add. 
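+     * @throws std::invalid_argument If the reverse of a whitelisted arc is already
+     * present in the network, or the whitelisted arc is not allowed in this
+     * Bayesian network.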
+ */ void force_whitelist(const ArcStringVector& arc_whitelist) override { for (const auto& arc : arc_whitelist) { if (!has_arc(arc.first, arc.second)) { - if (has_arc(arc.second, arc.first)) { + if (has_arc(arc.second, arc.first)) { // Check if the reverse arc is present throw std::invalid_argument("Arc " + arc.first + " -> " + arc.second + " in whitelist," " but arc " + arc.second + " -> " + arc.first + " is present" " in the Bayesian Network."); - } else if (can_add_arc(arc.first, arc.second)) { + } else if (can_add_arc(arc.first, arc.second)) { // Check if the arc can be added add_arc_unsafe(arc.first, arc.second); - } else { + } else { // Check if the arc can be flipped throw std::invalid_argument("Arc " + arc.first + " -> " + arc.second + " not allowed in this Bayesian network."); } diff --git a/pybnesian/opencl/opencl_config.hpp b/pybnesian/opencl/opencl_config.hpp index 565b7880..220818c9 100644 --- a/pybnesian/opencl/opencl_config.hpp +++ b/pybnesian/opencl/opencl_config.hpp @@ -515,18 +515,31 @@ void OpenCLConfig::reduction_cols_offset( } template +/** + * @brief Calculates the log(sum(exp(.))) of each column of a matrix. + * + * @param input_mat Matrix of size input_rows x input_cols + * @param input_rows Number of rows of the matrix + * @param input_cols Number of columns of the matrix + * @param output_vec Vector of size input_cols + * @param output_offset Offset of the output vector + */ void OpenCLConfig::logsumexp_cols_offset( cl::Buffer& input_mat, int input_rows, int input_cols, cl::Buffer& output_vec, int output_offset) { auto max_buffer = amax_cols(input_mat, input_rows, input_cols); + // exp(input_mat[idx] - max_buffer[col]); auto logsumexp_coeffs = kernel(OpenCL_kernel_traits::logsumexp_coeffs); logsumexp_coeffs.setArg(0, input_mat); logsumexp_coeffs.setArg(1, static_cast(input_rows)); logsumexp_coeffs.setArg(2, max_buffer); RAISE_ENQUEUEKERNEL_ERROR(m_queue.enqueueNDRangeKernel( logsumexp_coeffs, cl::NullRange, cl::NDRange(input_rows * input_cols), cl::NullRange)); + + // sum(exp(input_mat[idx] - max_buffer[col])); sum_cols_offset(input_mat, input_rows, input_cols, output_vec, static_cast(output_offset)); + // log(sum(exp(input_mat[idx] - max_buffer[col]))) + max_buffer[col]; auto finish_lse = kernel(OpenCL_kernel_traits::finish_lse_offset); finish_lse.setArg(0, output_vec); finish_lse.setArg(1, static_cast(output_offset)); diff --git a/pybnesian/pybindings/pybindings_learning/pybindings_algorithms.cpp b/pybnesian/pybindings/pybindings_learning/pybindings_algorithms.cpp index 96a37ecb..96a75d67 100644 --- a/pybnesian/pybindings/pybindings_learning/pybindings_algorithms.cpp +++ b/pybnesian/pybindings/pybindings_learning/pybindings_algorithms.cpp @@ -103,7 +103,7 @@ Executes a greedy hill-climbing algorithm. This calls :func:`GreedyHillClimbing. "bic" for :class:`BIC `, "bge" for :class:`BGe `, "cv-lik" for :class:`CVLikelihood `, "holdout-lik" for - :class:`HoldoutLikelihood `, "validated-lik for + :class:`HoldoutLikelihood `, "validated-lik" for :class:`ValidatedLikelihood `. :param operators: Set of operators in the search process. :param arc_blacklist: List of arcs blacklist (forbidden arcs). 
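As a usage sketch for the score and operator names documented above, a minimal
example follows. It assumes the top-level hc() entry point of these bindings;
the data, the variable names and the blacklisted arc are purely illustrative.

import numpy as np
import pandas as pd

from pybnesian import hc

# Illustrative continuous data with a chain dependency a -> b -> c.
rng = np.random.default_rng(0)
a = rng.normal(size=500)
b = 2.0 * a + rng.normal(size=500)
c = -b + rng.normal(size=500)
df = pd.DataFrame({"a": a, "b": b, "c": c})

# "validated-lik" selects ValidatedLikelihood and "arcs" the ArcOperatorSet;
# forbidden arcs are passed as (source, target) tuples.
model = hc(df, score="validated-lik", operators=["arcs"],
           arc_blacklist=[("c", "a")])
print(model.arcs())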
diff --git a/pybnesian/pybindings/pybindings_learning/pybindings_independences.cpp b/pybnesian/pybindings/pybindings_learning/pybindings_independences.cpp index 481922d5..68450aaf 100644 --- a/pybnesian/pybindings/pybindings_learning/pybindings_independences.cpp +++ b/pybnesian/pybindings/pybindings_learning/pybindings_independences.cpp @@ -7,13 +7,15 @@ #include #include #include +#include #include namespace py = pybind11; using learning::independences::IndependenceTest, learning::independences::continuous::LinearCorrelation, learning::independences::continuous::KMutualInformation, learning::independences::continuous::RCoT, - learning::independences::discrete::ChiSquare, learning::independences::hybrid::MutualInformation; + learning::independences::discrete::ChiSquare, learning::independences::hybrid::MutualInformation, + learning::independences::hybrid::MixedKMutualInformation; using learning::independences::DynamicIndependenceTest, learning::independences::continuous::DynamicLinearCorrelation, learning::independences::continuous::DynamicKMutualInformation, learning::independences::continuous::DynamicRCoT, @@ -524,5 +526,110 @@ The dynamic adaptation of the :class:`ChiSquare` independence test. Initializes a :class:`DynamicChiSquare` with the given :class:`DynamicDataFrame` ``df``. :param ddf: :class:`DynamicDataFrame` to create the :class:`DynamicChiSquare`. +)doc"); + + py::class_>( + root, "MixedKMutualInformation", R"doc( +This class implements a non-parametric independence test that is based on the estimation of the mutual information +using k-nearest neighbors, accelerated using vantage-point trees (VP-Trees). This independence is implemented for a mix of categorical and continuous data. + +This independence test is based on both [MSCMI]_ and [MixedCMIKnn]_. +)doc") + .def(py::init([](DataFrame df, + int k, + std::optional seed, + int shuffle_neighbors, + int samples, + std::string scaling, + bool gamma_approx, + bool adaptive_k, + int tree_leafsize) { + if (scaling != "normalized_rank" && scaling != "min_max") { + throw std::invalid_argument("scaling must be either 'min_max' or 'normalized_rank'"); + } + return MixedKMutualInformation(df, + k, + random_seed_arg(seed), + shuffle_neighbors, + samples, + scaling, + gamma_approx, + adaptive_k, + tree_leafsize); + }), + py::arg("df"), + py::arg("k") = 10, + py::arg("seed") = std::nullopt, + py::arg("shuffle_neighbors") = 5, + py::arg("samples") = 1000, + py::arg("scaling") = "min_max", + py::arg("gamma_approx") = true, + py::arg("adaptive_k") = true, + py::arg("tree_leafsize") = 16, + R"doc( +Initializes a :class:`MixedKMutualInformation` for data ``df``. ``k`` is the number of neighbors in the k-nn model used to +estimate the mutual information. + +This is a permutation independence test, so ``samples`` defines the number of permutations. ``shuffle neighbors`` +(:math:`k_{perm}` in the original paper [MixedCMIKnn]_) defines how many neighbors are used to perform the conditional +permutations. ``adaptive k`` enforces an upper bound for both ``k`` and ``shuffle neighbors``, so they are not greater +than the smallest cluster size (discrete configuration), as suggested in [MixedCMIKnn]_. + +:param df: DataFrame on which to calculate the independence tests. +:param k: number of neighbors in the k-nn model used to estimate the mutual information. +:param seed: A random seed number. If not specified or ``None``, a random seed is generated. +:param shuffle_neighbors: Number of neighbors used to perform the conditional permutation. 
+:param samples: Number of permutations for the :class:`MixedKMutualInformation`. +:param scaling: Transformation for the continuous variables to the [0,1] range. Can be either "min_max" or "normalized_rank". +:param gamma_approx: Whether or not to approximate the p-value by fitting a gamma distribution with the first three moments of the permutation statistics. +:param adaptive_k: If set to ``True``, upper bounds both ``k`` and ``shuffle neighbors`` to the minimum discrete configuration size, as in [MixedCMIKnn]_. If set to ``False``, + allows the k-nn model to consider dependencies between distinct discrete values, and is more biased towards zero estimates as in [MSCMI]_. +:param tree_leafsize: Maximum size for the VP-Tree leaves to abandon pruning for a brute force approach. + +)doc") + .def( + "mi", + [](MixedKMutualInformation& self, const std::string& x, const std::string& y) { return self.mi(x, y); }, + py::arg("x"), + py::arg("y"), + R"doc( +Estimates the unconditional mutual information :math:`\text{MI}(x, y)`. + +:param x: A variable name. +:param y: A variable name. +:returns: The unconditional mutual information :math:`\text{MI}(x, y)`. +)doc") + .def( + "mi", + [](MixedKMutualInformation& self, const std::string& x, const std::string& y, const std::string& z) { + return self.mi(x, y, z); + }, + py::arg("x"), + py::arg("y"), + py::arg("z"), + R"doc( +Estimates the univariate conditional mutual information :math:`\text{MI}(x, y \mid z)`. + +:param x: A variable name. +:param y: A variable name. +:param z: A variable name. +:returns: The univariate conditional mutual information :math:`\text{MI}(x, y \mid z)`. +)doc") + .def( + "mi", + [](MixedKMutualInformation& self, + const std::string& x, + const std::string& y, + const std::vector& z) { return self.mi(x, y, z); }, + py::arg("x"), + py::arg("y"), + py::arg("z"), + R"doc( +Estimates the multivariate conditional mutual information :math:`\text{MI}(x, y \mid \mathbf{z})`. + +:param x: A variable name. +:param y: A variable name. +:param z: A list of variable names. +:returns: The multivariate conditional mutual information :math:`\text{MI}(x, y \mid \mathbf{z})`. )doc"); } diff --git a/pybnesian/util/basic_eigen_ops.hpp b/pybnesian/util/basic_eigen_ops.hpp index a1ac9874..11ae49b0 100644 --- a/pybnesian/util/basic_eigen_ops.hpp +++ b/pybnesian/util/basic_eigen_ops.hpp @@ -1,7 +1,9 @@ #ifndef PYBNESIAN_UTIL_BASIC_EIGEN_OPS_HPP #define PYBNESIAN_UTIL_BASIC_EIGEN_OPS_HPP +#include #include +#include namespace util { @@ -132,18 +134,31 @@ Matrix sqrt_matrix(const M& m) { } // Checks whether M is positive definite. +/** + * @brief Checks whether a matrix M is positive definite. + * + * @tparam M Matrix type. + * @param m Matrix to check. + * @return true If M is positive definite. + * @return false If M is not positive definite. 
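+ * @note The check is performed with a Cholesky (LLT) factorization, which only
+ * succeeds for positive definite matrices; singular (positive semi-definite)
+ * matrices are therefore also reported as not positive definite.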
+ */ template -bool is_psd(const M& m) { +bool is_psd(const M& m, int verbose = 0) { using MatrixType = Matrix; - Eigen::SelfAdjointEigenSolver eigen_solver(m, Eigen::EigenvaluesOnly); - auto tol = eigen_solver.eigenvalues().maxCoeff() * m.rows() * std::numeric_limits::epsilon(); + Eigen::LLT lltOfM(m); // compute the Cholesky decomposition of m - if (eigen_solver.eigenvalues().minCoeff() < tol) { + if (lltOfM.info() == Eigen::NumericalIssue) { + std::stringstream ss; + ss << "basic_eigen_ops.hpp::is_psd:\t" + << "C++ Matrix m:\n" + << m << "\nCHOLESKY: Possibly non semi-positive definite matrix!"; + std::string log_str = ss.str(); + util::formatted_log_t(verbose, log_str); return false; + } else { + return true; } - - return true; } } // namespace util diff --git a/pybnesian/util/progress.hpp b/pybnesian/util/progress.hpp index 31038d13..fe0f8ec5 100644 --- a/pybnesian/util/progress.hpp +++ b/pybnesian/util/progress.hpp @@ -2,9 +2,44 @@ #define PYBNESIAN_UTIL_PROGRESS_HPP #include +// #include +// #include // sudo apt install libboost-all-dev +#include +// enum class log_level_t { LOG_NOTHING, LOG_CRITICAL, LOG_ERROR, LOG_WARNING, LOG_INFO, LOG_DEBUG }; namespace util { +// auto GLOBAL_LEVEL = log_level_t::LOG_INFO; +class formatted_log_t { +public: + formatted_log_t(int verbose_level, std::string msg) : verbose_level(verbose_level), msg(msg) {} + ~formatted_log_t() { + // GLOBAL_LEVEL is a global variable and could be changed at runtime + // Any customization could be here + // if (level <= GLOBAL_LEVEL) + // std::wcout << static_cast(level) << L" " << fmt.str() + // << std::endl; // Convert level to a string before printing + if (verbose_level > 0) { + std::cout << msg << std::endl; + } + } + // template + // formatted_log_t& operator%(T value) { + // fmt % value; + // return *this; + // } + // formatted_log_t log(int verbose_level, const char* msg) { return formatted_log_t(verbose_level, msg); } + +protected: + int verbose_level; + std::string msg; +}; +// Helper function. Class formatted_log_t will not be used directly. +// template +// formatted_log_t log(const char* msg) { +// return formatted_log_t(verbose_level, msg); +// } + class BaseIndeterminateSpinner { public: virtual ~BaseIndeterminateSpinner() {} @@ -65,6 +100,14 @@ class IndeterminateSpinner : public BaseIndeterminateSpinner { indicators::ProgressSpinner m_spinner; }; +/** + * @brief Creates a spinner based on the verbose level. + * + * @tparam Args Arguments to pass to the spinner. + * @param verbose_level 0: no spinner, 1: indeterminate spinner + * @param additional_args Additional arguments to pass to the spinner. + * @return std::unique_ptr Pointer to the spinner. + */ template std::unique_ptr indeterminate_spinner(int verbose_level, Args&&... additional_args) { switch (verbose_level) { diff --git a/pybnesian/util/validate_options.cpp b/pybnesian/util/validate_options.cpp index f524b456..10eb89b6 100644 --- a/pybnesian/util/validate_options.cpp +++ b/pybnesian/util/validate_options.cpp @@ -13,13 +13,25 @@ using models::GaussianNetworkType, models::KDENetworkType, models::Semiparametri namespace util { +/** + * @brief Checks if the given score is valid for the given Bayesian network type e.g., "bic","bge, "cv-lik", + * "holdout-lik", "validated-lik". 
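+ * If no score is specified, a default is selected from the network type: BIC for
+ * Gaussian networks, and validated likelihood for semiparametric and KDE networks.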
+ * + * @param df + * @param bn_type + * @param score + * @param seed + * @param num_folds + * @param test_holdout_ratio + * @return std::unique_ptr + */ std::unique_ptr check_valid_score(const DataFrame& df, const BayesianNetworkType& bn_type, const std::optional& score, int seed, int num_folds, double test_holdout_ratio) { - if (score) { + if (score) { // If score is specified if (*score == "bic") return std::make_unique(df); if (*score == "bge") return std::make_unique(df); if (*score == "cv-lik") return std::make_unique(df, num_folds, seed); @@ -33,17 +45,30 @@ std::unique_ptr check_valid_score(const DataFrame& df, "\"bic\" (Bayesian Information Criterion), \"bge\" (Bayesian Gaussian equivalent), " "\"cv-lik\" (Cross-Validated likelihood), \"holdout-l\" (Hold-out likelihood) " " or \"validated-lik\" (Validated likelihood with cross-validation)."); - } else { + } else { // If score is not specified if (bn_type == GaussianNetworkType::get_ref()) { - return std::make_unique(df); + return std::make_unique(df); // Default score for GaussianNetworkType } else if (bn_type == SemiparametricBNType::get_ref() || bn_type == KDENetworkType::get_ref()) { - return std::make_unique(df, test_holdout_ratio, num_folds, seed); + return std::make_unique( + df, test_holdout_ratio, num_folds, seed); // Default score for SemiparametricBNType and KDENetworkType } else { throw std::invalid_argument("Default score not defined for " + bn_type.ToString() + "."); } } } +/** + * @brief Checks if the given operators are valid for the given Bayesian network type ["arcs", "node_type"]. + * Otherwise, it returns the default operators for the given Bayesian network type + * + * @param bn_type + * @param operators + * @param arc_blacklist + * @param arc_whitelist + * @param max_indegree + * @param type_whitelist + * @return std::shared_ptr + */ std::shared_ptr check_valid_operators(const BayesianNetworkType& bn_type, const std::optional>& operators, const ArcStringVector& arc_blacklist, @@ -52,7 +77,7 @@ std::shared_ptr check_valid_operators(const BayesianNetworkType& bn const FactorTypeVector& type_whitelist) { std::vector> res; - if (operators && !operators->empty()) { + if (operators && !operators->empty()) { // If operators are specified for (auto& op : *operators) { if (op == "arcs") { res.push_back(std::make_shared(arc_blacklist, arc_whitelist, max_indegree)); @@ -71,7 +96,7 @@ std::shared_ptr check_valid_operators(const BayesianNetworkType& bn "\"arcs\" (Changes in arcs; addition, removal and flip) or " "\"node_type\" (Change of node type)"); } - } else { + } else { // If operators are not specified if (bn_type == GaussianNetworkType::get_ref()) res.push_back(std::make_shared(arc_blacklist, arc_whitelist, max_indegree)); else if (bn_type == SemiparametricBNType::get_ref()) { diff --git a/pybnesian/util/validate_whitelists.hpp b/pybnesian/util/validate_whitelists.hpp index 81976e03..c06880b9 100644 --- a/pybnesian/util/validate_whitelists.hpp +++ b/pybnesian/util/validate_whitelists.hpp @@ -151,6 +151,15 @@ ListRestrictions validate_restrictions(const Model& g, return r; } +/** + * @brief Validate the arc restrictions for a model. + * + * @tparam Model + * @param g + * @param varc_blacklist + * @param varc_whitelist + * @return ListRestrictions + */ template ListRestrictions validate_restrictions(const Model& g, const ArcStringVector& varc_blacklist, @@ -181,6 +190,14 @@ ListRestrictions validate_restrictions(const Model& g, return r; } +/** + * @brief Validates the type restrictions for a model. 
+ * + * @tparam Model + * @param g + * @param type_blacklist + * @param type_whitelist + */ template void validate_type_restrictions(const Model& g, const FactorTypeVector& type_blacklist, diff --git a/pybnesian/vptree/vptree.cpp b/pybnesian/vptree/vptree.cpp new file mode 100644 index 00000000..8d1ad73c --- /dev/null +++ b/pybnesian/vptree/vptree.cpp @@ -0,0 +1,764 @@ +#include +#include + +namespace vptree { + +template +using Neighbor = std::pair; + +template +struct NeighborComparator { + inline bool operator()(const Neighbor& a, const Neighbor& b) { + return a.first < b.first; // max-heap + } +}; + +template +using NeighborQueue = + std::priority_queue, std::vector>, NeighborComparator>; + +template +struct QueryNode { + VPTreeNode* node; + typename ArrowType::c_type min_distance; +}; + +template +struct QueryNodeComparator { + inline bool operator()(const QueryNode& a, const QueryNode& b) { + return a.min_distance > b.min_distance; // closer neighbors are visited first + } +}; + +template +using QueryQueue = + std::priority_queue, std::vector>, QueryNodeComparator>; + +template +std::unique_ptr build_vptree(const HybridChebyshevDistance& distance, + std::vector& indices_parent, + int leafsize, + Random& rng) { + using CType = typename ArrowType::c_type; + + // ending conditions of the recursion + if (indices_parent.empty()) return nullptr; + + if (indices_parent.size() <= static_cast(leafsize)) { + auto leaf = std::make_unique(); + leaf->threshold = 0.0; + leaf->is_leaf = true; + leaf->leaf_indices = indices_parent; + return leaf; + } + + size_t rand_selection = std::uniform_int_distribution(0, indices_parent.size() - 1)(rng); + std::iter_swap(indices_parent.begin() + rand_selection, indices_parent.begin()); + size_t vp_index = indices_parent[0]; + + std::vector> distances_indices(indices_parent.size() - 1); + + CType max = 0; + + // compute distances against the vantange point + for (size_t i = 1; i < indices_parent.size(); ++i) { + auto dist = distance.distance(indices_parent[i], vp_index); + distances_indices[i - 1] = std::make_pair(dist, indices_parent[i]); + if (dist > max) max = dist; + } + + // super-leaf for configurations where all points overlap + if (max == 0) { + auto leaf = std::make_unique(); + leaf->threshold = 0.0; + leaf->is_leaf = true; + leaf->leaf_indices = indices_parent; + + return leaf; + } + + auto it = std::find_if(distances_indices.begin(), distances_indices.end(), [](const std::pair& p) { + return p.first == std::numeric_limits::infinity(); // Check if any distance is infinity + }); + + // prioritize discrete splits + CType threshold = 1.0; + + if (it == distances_indices.end()) { + // if none, node radius is the median + std::nth_element( + distances_indices.begin(), + distances_indices.begin() + distances_indices.size() / 2, + distances_indices.end(), + [](const std::pair& a, const std::pair& b) { return a.first > b.first; }); + threshold = distances_indices[distances_indices.size() / 2].first; + } + + std::vector indices_left, indices_right; + + // follow convention for left child, contains neighbors within the radius and on the hypersphere surface + for (size_t i = 0; i < distances_indices.size(); ++i) { + if (distances_indices[i].first <= threshold) { + indices_left.push_back(distances_indices[i].second); + } else { + indices_right.push_back(distances_indices[i].second); + } + } + + auto node = std::make_unique(); + + node->index = vp_index; + node->threshold = threshold; + node->is_leaf = false; + + node->left = build_vptree(distance, indices_left, 
leafsize, rng);
+    node->right = build_vptree(distance, indices_right, leafsize, rng);
+
+    return node;
+}
+
+std::unique_ptr<VPTreeNode> VPTree::build_vptree(const DataFrame& df,
+                                                 const std::shared_ptr<arrow::DataType> datatype,
+                                                 const std::vector<bool>& is_discrete_column,
+                                                 int leafsize,
+                                                 unsigned int seed) {
+    std::vector<size_t> indices(m_df->num_rows());
+    std::iota(indices.begin(), indices.end(), 0);
+    std::mt19937 rng{seed};
+
+    switch (datatype->id()) {
+        case Type::DOUBLE: {
+            auto data = df.downcast_vector<arrow::DoubleType>();
+
+            HybridChebyshevDistance<arrow::DoubleType> distance(data, is_discrete_column);
+            return vptree::build_vptree(distance, indices, leafsize, rng);
+        }
+        case Type::FLOAT: {
+            auto data = df.downcast_vector<arrow::FloatType>();
+
+            HybridChebyshevDistance<arrow::FloatType> distance(data, is_discrete_column);
+            return vptree::build_vptree(distance, indices, leafsize, rng);
+        }
+        default:
+            throw std::invalid_argument("Wrong data type to apply VPTree.");
+    }
+}
+
+std::vector<std::pair<VectorXd, VectorXi>> VPTree::query(const DataFrame& test_df, int k) const {
+    if (k >= m_df->num_rows()) {
+        throw std::invalid_argument("\"k\" value equal to or greater than the training data size.");
+    }
+
+    test_df.raise_has_columns(m_column_names);
+
+    std::vector<std::pair<VectorXd, VectorXi>> res(test_df->num_rows());
+
+    // cache results only for fully discrete data, where identical rows repeat
+    auto cache_values =
+        std::all_of(m_is_discrete_column.begin(), m_is_discrete_column.end(), [](bool val) { return val; });
+
+    auto num_rows = test_df->num_rows();
+
+    switch (m_datatype->id()) {
+        case Type::FLOAT: {
+            auto test = test_df.downcast_vector<arrow::FloatType>();
+            HybridChebyshevDistance<arrow::FloatType> dist(test, m_is_discrete_column);
+            auto hash_keys = hash_columns(test, m_column_names, cache_values);
+#pragma omp parallel
+            {
+                if (cache_values) {
+#pragma omp for schedule(dynamic)
+                    for (int i = 0; i < num_rows; ++i) {
+                        auto key = hash_keys[i];
+                        bool skip_query = false;
+                        // the cache is shared between threads, so reads and writes are guarded
+#pragma omp critical
+                        {
+                            auto it = m_query_cache.find(key);
+                            if (it != m_query_cache.end()) {
+                                skip_query = true;
+                                res[i] = it->second;
+                            }
+                        }
+                        if (skip_query) {
+                            continue;  // Skip the query, use cached result
+                        }
+
+                        auto t = query_instance(i, k, dist);
+                        res[i] = t;
+
+#pragma omp critical
+                        {
+                            m_query_cache[key] = t;
+                        }
+                    }
+                }
+
+                else {
+#pragma omp for
+                    for (int i = 0; i < num_rows; ++i) {
+                        auto t = query_instance(i, k, dist);
+                        res[i] = t;
+                    }
+                }
+            }
+
+            break;
+        }
+
+        default: {
+            auto test = test_df.downcast_vector<arrow::DoubleType>();
+
+            HybridChebyshevDistance<arrow::DoubleType> dist(test, m_is_discrete_column);
+            auto hash_keys = hash_columns(test, m_column_names, cache_values);
+#pragma omp parallel
+            {
+                if (cache_values) {
+#pragma omp for schedule(dynamic)
+                    for (int i = 0; i < num_rows; ++i) {
+                        auto key = hash_keys[i];
+                        bool skip_query = false;
+#pragma omp critical
+                        {
+                            auto it = m_query_cache.find(key);
+                            if (it != m_query_cache.end()) {
+                                skip_query = true;
+                                res[i] = it->second;
+                            }
+                        }
+                        if (skip_query) {
+                            continue;  // Skip the query, use cached result
+                        }
+                        auto t = query_instance(i, k, dist);
+                        res[i] = t;
+
+#pragma omp critical
+                        {
+                            m_query_cache[key] = t;
+                        }
+                    }
+                } else {
+#pragma omp for
+                    for (int i = 0; i < num_rows; ++i) {
+                        auto t = query_instance(i, k, dist);
+                        res[i] = t;
+                    }
+                }
+            }
+        }
+    }
+
+    if (cache_values) {
+        // cleared because, after permuting X, the XYZ space will not be the same
+        m_query_cache.clear();
+    }
+
+    return res;
+}
+
+std::tuple<VectorXi, VectorXi, VectorXi> VPTree::count_ball_subspaces(const DataFrame& test_df,
+                                                                      const VectorXd& eps,
+                                                                      std::vector<bool>& is_discrete_column) const {
+    test_df.raise_has_columns(m_column_names);
+
+    auto n_rows = test_df->num_rows();
+    VectorXi count_xz(n_rows);
+    VectorXi count_yz(n_rows);
+    VectorXi count_z(n_rows);
+
+    // cache results only for fully discrete data, where identical rows repeat
+    auto cache_values = std::all_of(is_discrete_column.begin(), is_discrete_column.end(), [](bool val) { return val; });
+
+    switch (m_datatype->id()) {
+        case Type::FLOAT: {
+            auto test = test_df.downcast_vector<arrow::FloatType>();
+            HybridChebyshevDistance<arrow::FloatType> distance_xyz(test, is_discrete_column);
+
+            auto hash_keys = hash_columns(test, test_df.column_names(), cache_values);
+#pragma omp parallel
+            {
+                if (cache_values) {
+#pragma omp for schedule(dynamic)
+                    for (int i = 0; i < n_rows; ++i) {
+                        auto key = hash_keys[i];
+
+                        // the ball radius is part of the cache key
+                        boost::hash_combine(key, eps(i));
+                        bool skip_query = false;
+#pragma omp critical
+                        {
+                            auto it = m_count_cache.find(key);
+                            if (it != m_count_cache.end()) {
+                                skip_query = true;
+                                count_xz(i) = std::get<0>(it->second);
+                                count_yz(i) = std::get<1>(it->second);
+                                count_z(i) = std::get<2>(it->second);
+                            }
+                        }
+                        if (skip_query) {
+                            continue;  // Skip the query, use cached result
+                        }
+                        auto c = count_ball_subspaces_instance(i, eps(i), distance_xyz);
+
+                        count_xz(i) = std::get<0>(c);
+                        count_yz(i) = std::get<1>(c);
+                        count_z(i) = std::get<2>(c);
+
+#pragma omp critical
+                        {
+                            m_count_cache[key] = c;
+                        }
+                    }
+                } else {
+#pragma omp for
+                    for (int i = 0; i < n_rows; ++i) {
+                        auto c = count_ball_subspaces_instance(i, eps(i), distance_xyz);
+
+                        count_xz(i) = std::get<0>(c);
+                        count_yz(i) = std::get<1>(c);
+                        count_z(i) = std::get<2>(c);
+                    }
+                }
+            }
+
+            break;
+        }
+        default: {
+            auto test = test_df.downcast_vector<arrow::DoubleType>();
+            HybridChebyshevDistance<arrow::DoubleType> distance_xyz(test, is_discrete_column);
+            auto hash_keys = hash_columns(test, test_df.column_names(), cache_values);
+#pragma omp parallel
+            {
+                if (cache_values) {
+#pragma omp for schedule(dynamic)
+                    for (int i = 0; i < n_rows; ++i) {
+                        auto key = hash_keys[i];
+
+                        boost::hash_combine(key, eps(i));
+                        bool skip_query = false;
+#pragma omp critical
+                        {
+                            auto it = m_count_cache.find(key);
+                            if (it != m_count_cache.end()) {
+                                skip_query = true;
+                                count_xz(i) = std::get<0>(it->second);
+                                count_yz(i) = std::get<1>(it->second);
+                                count_z(i) = std::get<2>(it->second);
+                            }
+                        }
+                        if (skip_query) {
+                            continue;  // Skip the query, use cached result
+                        }
+                        auto c = count_ball_subspaces_instance(i, eps(i), distance_xyz);
+
+                        count_xz(i) = std::get<0>(c);
+                        count_yz(i) = std::get<1>(c);
+                        count_z(i) = std::get<2>(c);
+
+#pragma omp critical
+                        {
+                            m_count_cache[key] = c;
+                        }
+                    }
+                } else {
+#pragma omp for
+                    for (int i = 0; i < n_rows; ++i) {
+                        auto c = count_ball_subspaces_instance(i, eps(i), distance_xyz);
+
+                        count_xz(i) = std::get<0>(c);
+                        count_yz(i) = std::get<1>(c);
+                        count_z(i) = std::get<2>(c);
+                    }
+                }
+            }
+        }
+    }
+
+    if (cache_values) {
+        // cleared because, after permuting X, the XYZ space will not be the same
+        m_count_cache.clear();
+    }
+
+    return std::make_tuple(count_xz, count_yz, count_z);
+}
+
+template <typename ArrowType>
+std::vector<size_t> vptree::hash_columns(
+    const std::vector<std::shared_ptr<typename arrow::TypeTraits<ArrowType>::ArrayType>>& data,
+    std::vector<std::string> column_names,
+    bool discrete_data) {
+    int num_rows = data.empty() ? 0 : data[0]->length();
+    std::vector<size_t> row_hashes(num_rows, 0);
+
+    if (discrete_data) {
+        size_t colnames_hash = boost::hash_range(column_names.begin(), column_names.end());
+#pragma omp parallel for
+        for (int i = 0; i < num_rows; ++i) {
+            size_t h = 0;  // local hash for row i
+            for (size_t j = 0; j < data.size(); ++j) {
+                auto value = data[j]->Value(i);
+                boost::hash_combine(h, value);
+            }
+            boost::hash_combine(h, colnames_hash);
+            row_hashes[i] = h;
+        }
+    }
+
+    return row_hashes;
+}
+
+VectorXi VPTree::count_ball_unconditional(const DataFrame& test_df,
+                                          const VectorXd& eps,
+                                          std::vector<bool>& is_discrete_column) const {
+    test_df.raise_has_columns(m_column_names);
+
+    auto n_rows = test_df->num_rows();
+    VectorXi count_n(n_rows);
+
+    // cache results only for fully discrete data, where identical rows repeat
+    auto cache_values = std::all_of(is_discrete_column.begin(), is_discrete_column.end(), [](bool val) { return val; });
+
+    switch (m_datatype->id()) {
+        case Type::FLOAT: {
+            auto test = test_df.downcast_vector<arrow::FloatType>();
+            HybridChebyshevDistance<arrow::FloatType> distance(test, is_discrete_column);
+            auto hash_keys = hash_columns(test, test_df.column_names(), cache_values);
+#pragma omp parallel
+            {
+                if (cache_values) {
+#pragma omp for schedule(dynamic)
+                    for (int i = 0; i < n_rows; ++i) {
+                        auto key = hash_keys[i];
+
+                        boost::hash_combine(key, eps(i));
+                        bool skip_query = false;
+#pragma omp critical
+                        {
+                            auto it = m_count_cache_unconditional.find(key);
+                            if (it != m_count_cache_unconditional.end()) {
+                                skip_query = true;
+                                count_n(i) = it->second;
+                            }
+                        }
+                        if (skip_query) {
+                            continue;  // Skip the query, use cached result
+                        }
+
+                        auto c = count_ball_unconditional_instance(i, eps(i), distance);
+
+                        count_n(i) = c;
+
+#pragma omp critical
+                        {
+                            m_count_cache_unconditional[key] = c;
+                        }
+                    }
+                } else {
+#pragma omp for
+                    for (int i = 0; i < n_rows; ++i) {
+                        auto c = count_ball_unconditional_instance(i, eps(i), distance);
+
+                        count_n(i) = c;
+                    }
+                }
+            }
+
+            break;
+        }
+        default: {
+            auto test = test_df.downcast_vector<arrow::DoubleType>();
+            HybridChebyshevDistance<arrow::DoubleType> distance(test, is_discrete_column);
+            auto hash_keys = hash_columns(test, test_df.column_names(), cache_values);
+#pragma omp parallel
+            {
+                if (cache_values) {
+#pragma omp for schedule(dynamic)
+                    for (int i = 0; i < n_rows; ++i) {
+                        auto key = hash_keys[i];
+
+                        boost::hash_combine(key, eps(i));
+                        bool skip_query = false;
+#pragma omp critical
+                        {
+                            auto it = m_count_cache_unconditional.find(key);
+                            if (it != m_count_cache_unconditional.end()) {
+                                skip_query = true;
+                                count_n(i) = it->second;
+                            }
+                        }
+                        if (skip_query) {
+                            continue;  // Skip the query, use cached result
+                        }
+
+                        auto c = count_ball_unconditional_instance(i, eps(i), distance);
+
+                        count_n(i) = c;
+#pragma omp critical
+                        {
+                            m_count_cache_unconditional[key] = c;
+                        }
+                    }
+                } else {
+#pragma omp for
+                    for (int i = 0; i < n_rows; ++i) {
+                        auto c = count_ball_unconditional_instance(i, eps(i), distance);
+
+                        count_n(i) = c;
+                    }
+                }
+            }
+        }
+    }
+
+    /* the cache is not cleared here: the Y subspace is never permuted, so
+       recycled yTrees can still benefit from the stored counts */
+
+    return count_n;
+}
+
+template <typename ArrowType>
+std::pair<VectorXd, VectorXi> VPTree::query_instance(size_t i,
+                                                     int k,
+                                                     const HybridChebyshevDistance<ArrowType>& distance) const {
+    using CType = typename ArrowType::c_type;
+
+    // max-heap
+    NeighborQueue neighborhood;
+
+    // auxiliary list beside the max-heap that stores the neighbors tied at the k-nn distance
+    std::pair<CType, std::vector<size_t>> neighborhood_star;
+
+    CType distance_upper_bound = neighborhood_star.first = std::numeric_limits<CType>::infinity(), distance_neigh = 0;
+
+    // iterative approach that avoids recursion overhead
+    QueryQueue query_nodes;
+    CType min_distance = 0;
+
+    // start at the root node
+    query_nodes.push(QueryNode{m_root.get(), min_distance});
+
+    while (!query_nodes.empty()) {
+        auto& query = query_nodes.top();
+        auto node = query.node;
+
+        query_nodes.pop();
+
+        std::vector<size_t> eval_neighbors(1, node->index);
+
+        if (node->is_leaf) {
+            eval_neighbors = node->leaf_indices;
+        }
+
+        auto num_neighbors = eval_neighbors.size();
+
+        for (auto it_neigh = eval_neighbors.begin(), neigh_end = eval_neighbors.end(); it_neigh != neigh_end;
+             ++it_neigh) {
+            distance_neigh = distance.distance(*it_neigh, i);
+
+            if (neighborhood.size() == static_cast<size_t>(k)) {
+                if (distance_neigh < distance_upper_bound) {
+                    neighborhood.pop();
+                    neighborhood.push(std::make_pair(distance_neigh, *it_neigh));
+                    // check that the tied neighbors still match the k-nn distance
+                    if (neighborhood_star.first > neighborhood.top().first) {
+                        neighborhood_star.second.clear();
+                    }
+                } else if (distance_neigh == distance_upper_bound) {
+                    // process super-leaf values as one
+                    if (num_neighbors > static_cast<size_t>(m_leafsize)) {
+                        neighborhood_star.second.reserve(neighborhood_star.second.size() +
+                                                         std::distance(it_neigh, neigh_end));
+                        neighborhood_star.second.insert(neighborhood_star.second.end(), it_neigh, neigh_end);
+                        neighborhood_star.first = distance_neigh;
+                        break;
+                    } else {
+                        neighborhood_star.second.push_back(*it_neigh);
+                        neighborhood_star.first = distance_neigh;
+                    }
+                    // process super-leaf values as one
+                } else if (num_neighbors > static_cast<size_t>(m_leafsize))
+                    break;
+            } else {
+                neighborhood.push(std::make_pair(distance_neigh, *it_neigh));
+            }
+
+            if (neighborhood.size() == static_cast<size_t>(k)) {
+                distance_upper_bound = neighborhood.top().first;
+            }
+        }
+
+        // use the triangle inequality to prune branches
+        CType left_min_distance = distance_neigh - node->threshold;
+
+        if (node->left && left_min_distance <= distance_upper_bound) {
+            query_nodes.push(QueryNode{node->left.get(), left_min_distance});
+        }
+
+        CType right_min_distance = node->threshold - distance_neigh;
+
+        if (node->right && right_min_distance <= distance_upper_bound) {
+            query_nodes.push(QueryNode{node->right.get(), right_min_distance});
+        }
+    }
+
+    auto k_hat = k + neighborhood_star.second.size();
+    VectorXd distances(k);  // just size k, since the tied neighbors all share the same k-nn distance
+    VectorXi indices(k_hat);
+
+    std::copy(neighborhood_star.second.begin(),
+              neighborhood_star.second.end(),
+              indices.data() + (k_hat - neighborhood_star.second.size()));
+
+    auto u = k - 1;
+    while (!neighborhood.empty()) {
+        auto& neigh = neighborhood.top();
+        distances(u) = neigh.first;
+        indices(u) = neigh.second;
+        neighborhood.pop();
+        --u;
+    }
+
+    return std::make_pair(distances, indices);
+}
+
+template <typename ArrowType>
+std::tuple<int, int, int> VPTree::count_ball_subspaces_instance(
+    size_t i,
+    const typename ArrowType::c_type eps_value,
+    const HybridChebyshevDistance<ArrowType>& distance_xyz) const {
+    using CType = typename ArrowType::c_type;
+
+    CType min_distance = 0, d_z = 0;
+
+    int count_xz = 0, count_yz = 0, count_z = 0;
+
+    // iterative approach that avoids recursion overhead
+    QueryQueue query_nodes;
+
+    // start at the root node
+    query_nodes.push(QueryNode{m_root.get(), min_distance});
+
+    std::vector<int> z_indices(m_df->num_columns() - 2);
+    std::iota(z_indices.begin(), z_indices.end(), 2);
+
+    std::vector<int> x_index(1, 0);
+    std::vector<int> y_index(1, 1);
+
+    while (!query_nodes.empty()) {
+        auto& query = query_nodes.top();
+        auto node = query.node;
+
+        query_nodes.pop();
+
+        std::vector<size_t> eval_neighbors(1, node->index);
+
+        if (node->is_leaf) {
+            eval_neighbors = node->leaf_indices;
+        }
+
+        auto num_neighbors = eval_neighbors.size();
+
+        for (auto it_neigh = eval_neighbors.begin(), neigh_end = eval_neighbors.end(); it_neigh != neigh_end;
+             ++it_neigh) {
+            // trick: since Z is a subspace of XZ and YZ, we can constrain the vptree building and
+            // search just to Z, then check for X and Y
+            d_z = distance_xyz.distance_coords(*it_neigh, i, z_indices);
+
+            if (d_z <= eps_value) {
+                if (num_neighbors <= static_cast<size_t>(m_leafsize)) {
+                    ++count_z;
+                    if (distance_xyz.distance_coords(*it_neigh, i, x_index) <= eps_value) ++count_xz;
+                    if (distance_xyz.distance_coords(*it_neigh, i, y_index) <= eps_value) ++count_yz;
+                } else {
+                    // process super-leaf values as one, at least for Z
+                    count_z += num_neighbors;
+                    for (; it_neigh != neigh_end; ++it_neigh) {
+                        if (distance_xyz.distance_coords(*it_neigh, i, x_index) <= eps_value) ++count_xz;
+                        if (distance_xyz.distance_coords(*it_neigh, i, y_index) <= eps_value) ++count_yz;
+                    }
+                    break;
+                }
+            } else if (num_neighbors > static_cast<size_t>(m_leafsize))
+                // process super-leaf values as one
+                break;
+        }
+
+        // use the triangle inequality to prune branches
+        CType left_min_distance = d_z - node->threshold;
+
+        if (node->left && left_min_distance <= eps_value) {
+            query_nodes.push(QueryNode{node->left.get(), left_min_distance});
+        }
+
+        CType right_min_distance = node->threshold - d_z;
+
+        if (node->right && right_min_distance <= eps_value) {
+            query_nodes.push(QueryNode{node->right.get(), right_min_distance});
+        }
+    }
+
+    return std::make_tuple(count_xz, count_yz, count_z);
+}
+
+template <typename ArrowType>
+int VPTree::count_ball_unconditional_instance(size_t i,
+                                              const typename ArrowType::c_type eps_value,
+                                              const HybridChebyshevDistance<ArrowType>& distance) const {
+    using CType = typename ArrowType::c_type;
+
+    CType min_distance = 0, distance_neigh = 0;
+
+    int count_n = 0;
+
+    // iterative approach that avoids recursion overhead
+    QueryQueue query_nodes;
+
+    // start at the root node
+    query_nodes.push(QueryNode{m_root.get(), min_distance});
+
+    while (!query_nodes.empty()) {
+        auto& query = query_nodes.top();
+        auto node = query.node;
+
+        query_nodes.pop();
+
+        std::vector<size_t> eval_neighbors(1, node->index);
+
+        if (node->is_leaf) {
+            eval_neighbors = node->leaf_indices;
+        }
+
+        auto num_neighbors = eval_neighbors.size();
+
+        for (auto it_neigh = eval_neighbors.begin(), neigh_end = eval_neighbors.end(); it_neigh != neigh_end;
+             ++it_neigh) {
+            distance_neigh = distance.distance(*it_neigh, i);
+
+            if (distance_neigh <= eps_value) {
+                if (num_neighbors <= static_cast<size_t>(m_leafsize)) {
+                    ++count_n;
+                } else {
+                    // process super-leaf values as one
+                    count_n += num_neighbors;
+                    break;
+                }
+            } else if (num_neighbors > static_cast<size_t>(m_leafsize))
+                // process super-leaf values as one
+                break;
+        }
+
+        // use the triangle inequality to prune branches
+        CType left_min_distance = distance_neigh - node->threshold;
+
+        if (node->left && left_min_distance <= eps_value) {
+            query_nodes.push(QueryNode{node->left.get(), left_min_distance});
+        }
+
+        CType right_min_distance = node->threshold - distance_neigh;
+
+        if (node->right && right_min_distance <= eps_value) {
+            query_nodes.push(QueryNode{node->right.get(), right_min_distance});
+        }
+    }
+
+    return count_n;
+}
+
+}  // namespace vptree
diff --git a/pybnesian/vptree/vptree.hpp b/pybnesian/vptree/vptree.hpp
new file mode 100644
index 00000000..430a1128
--- /dev/null
+++ b/pybnesian/vptree/vptree.hpp
@@ -0,0 +1,149 @@
+#ifndef PYBNESIAN_VPTREE_HPP
+#define PYBNESIAN_VPTREE_HPP
+
+#include
+#include
+#include
+#include
+#include
+
+using dataset::DataFrame;
+using Eigen::Matrix, Eigen::VectorXd, Eigen::VectorXi;
+
+namespace vptree {
+
+template <typename ArrowType>
+std::vector<size_t> hash_columns(
+    const std::vector<std::shared_ptr<typename arrow::TypeTraits<ArrowType>::ArrayType>>& data,
+    std::vector<std::string> column_names,
+    bool discrete_data);
+
+template <typename ArrowType>
+class HybridChebyshevDistance {
+public:
+    using CType = typename ArrowType::c_type;
+    using ArrayType = typename arrow::TypeTraits<ArrowType>::ArrayType;
+    using OperationFunc = std::function<CType(size_t, size_t)>;
+
+    HybridChebyshevDistance(const std::vector<std::shared_ptr<ArrayType>>& data,
+                            const std::vector<bool>& is_discrete_column)
+        : m_data(data) {
+        m_operations_coords.reserve(m_data.size());
+        for (size_t i = 0; i < m_data.size(); ++i) {
+            if (is_discrete_column[i]) {
+                // For discrete columns, Hamming {0, inf} distance
+                m_operations_coords.push_back([this, i](size_t p1_index, size_t p2_index) -> CType {
+                    return (m_data[i]->Value(p1_index) != m_data[i]->Value(p2_index))
+                               ? std::numeric_limits<CType>::infinity()
+                               : 0.0;
+                });
+            } else {
+                // For continuous columns, the absolute difference (the max() in distance() makes it Chebyshev)
+                m_operations_coords.push_back([this, i](size_t p1_index, size_t p2_index) -> CType {
+                    return std::abs(m_data[i]->Value(p1_index) - m_data[i]->Value(p2_index));
+                });
+            }
+        }
+    }
+
+    inline CType distance(size_t p1_index, size_t p2_index) const {
+        CType d = 0;
+        for (auto it_operation = m_operations_coords.begin(), it_end = m_operations_coords.end();
+             it_operation != it_end;
+             ++it_operation) {
+            d = std::max(d, (*it_operation)(p1_index, p2_index));
+        }
+
+        return d;
+    }
+
+    inline CType distance_coords(size_t p1_index, size_t p2_index, std::vector<int>& coords) const {
+        CType d = 0;
+        for (auto it_col_idx = coords.begin(); it_col_idx != coords.end(); it_col_idx++) {
+            d = std::max(d, m_operations_coords[*it_col_idx](p1_index, p2_index));
+        }
+
+        return d;
+    }
+
+private:
+    const std::vector<std::shared_ptr<ArrayType>>& m_data;
+    std::vector<OperationFunc> m_operations_coords;
+};
+
+struct VPTreeNode {
+    size_t index;
+    double threshold;
+    std::unique_ptr<VPTreeNode> left;
+    std::unique_ptr<VPTreeNode> right;
+    std::vector<size_t> leaf_indices;
+    bool is_leaf;
+};
+
+class VPTree {
+public:
+    VPTree(DataFrame& df,
+           std::shared_ptr<arrow::DataType> datatype,
+           std::vector<bool>& is_discrete_column,
+           int leafsize = 16,
+           unsigned int seed = std::random_device{}())
+        : m_df(df),
+          m_datatype(datatype),
+          m_is_discrete_column(is_discrete_column),
+          m_column_names(df.column_names()),
+          m_root(),
+          m_leafsize(leafsize),
+          m_seed(seed),
+          m_query_cache(),
+          m_count_cache(),
+          m_count_cache_unconditional() {
+        m_root = build_vptree(m_df, m_datatype, m_is_discrete_column, m_leafsize, m_seed);
+    }
+
+    std::vector<std::pair<VectorXd, VectorXi>> query(const DataFrame& test_df, int k) const;
+
+    std::tuple<VectorXi, VectorXi, VectorXi> count_ball_subspaces(const DataFrame& test_df,
+                                                                  const VectorXd& eps,
+                                                                  std::vector<bool>& is_discrete_column) const;
+
+    VectorXi count_ball_unconditional(const DataFrame& test_df,
+                                      const VectorXd& eps,
+                                      std::vector<bool>& is_discrete_column) const;
+
+    const DataFrame& scaled_data() const { return m_df; }
+
+private:
+    std::unique_ptr<VPTreeNode> build_vptree(const DataFrame& df,
+                                             const std::shared_ptr<arrow::DataType> datatype,
+                                             const std::vector<bool>& is_discrete_column,
+                                             int leafsize,
+                                             unsigned int seed);
+
+    template <typename ArrowType>
+    std::pair<VectorXd, VectorXi> query_instance(size_t i,
+                                                 int k,
+                                                 const HybridChebyshevDistance<ArrowType>& distance) const;
+
+    template <typename ArrowType>
+    std::tuple<int, int, int> count_ball_subspaces_instance(
+        size_t i,
+        const typename ArrowType::c_type eps_value,
+        const HybridChebyshevDistance<ArrowType>& distance) const;
+
+    template <typename ArrowType>
+    int count_ball_unconditional_instance(size_t i,
+                                          const typename ArrowType::c_type eps_value,
+                                          const HybridChebyshevDistance<ArrowType>& distance) const;
+
+    DataFrame& m_df;
+    std::shared_ptr<arrow::DataType> m_datatype;
+    std::vector<bool>& m_is_discrete_column;
+    std::vector<std::string> m_column_names;
+    std::unique_ptr<VPTreeNode> m_root;
+    int m_leafsize;
+    unsigned int m_seed;
+    mutable std::unordered_map<size_t, std::pair<VectorXd, VectorXi>> m_query_cache;
+    mutable std::unordered_map<size_t, std::tuple<int, int, int>> m_count_cache;
+    mutable std::unordered_map<size_t, int> m_count_cache_unconditional;
+};
+
+}  // namespace vptree
+
+#endif  // PYBNESIAN_VPTREE_HPP
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index 1be96df0..4d4b6f57 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -13,10 +13,13 @@ sdist.exclude = ["vcpkg/*", "docs/"]
 [project]
 name = "pybnesian"
-authors = [{name = "David Atienza", email = "datienza@fi.upm.es"}]
-description="PyBNesian is a Python package that implements Bayesian networks."
-version = "0.5.1"
-readme = {file = "README.md", content-type = "text/markdown"}
+authors = [
+    { name = "David Atienza", email = "datienza@fi.upm.es" },
+    { name = "Carlos Li Hu", email = "carloslihu96@gmail.com" },
+]
+description = "PyBNesian is a Python package that implements Bayesian networks."
+version = "0.5.2"
+readme = { file = "README.md", content-type = "text/markdown" }
 license = { file = "LICENSE" }
 requires-python = ">=3.8"
 classifiers = [
@@ -24,35 +27,28 @@ classifiers = [
     "Programming Language :: C++",
     "License :: OSI Approved :: MIT License",
     "Operating System :: OS Independent",
-    "Topic :: Scientific/Engineering :: Artificial Intelligence"
+    "Topic :: Scientific/Engineering :: Artificial Intelligence",
 ]
 keywords = []
-dependencies = [
-    "pybind11>=2.6",
-    "pyarrow>=14",
-    "numpy"
-]
+dependencies = ["pybind11>=2.6", "pyarrow>=14", "numpy"]
 
 [project.urls]
-homepage = "https://github.com/davenza/PyBNesian" # FIXME not shown by pip
+homepage = "https://github.com/davenza/PyBNesian"  # FIXME not shown by pip
 documentation = "https://pybnesian.readthedocs.io/en/latest/?badge=latest"
 changelog = "https://pybnesian.readthedocs.io/en/latest/changelog.html"
 
 [tool.cibuildwheel]
-skip=["pp*",
-    "*-win32",
-    "*-musllinux*",
-    "*i686*",
-    "*ppc64le*",
-    "*s390x*"]
+skip = ["pp*", "*-win32", "*-musllinux*", "*i686*", "*ppc64le*", "*s390x*"]
 
 [tool.cibuildwheel.linux]
 before-all = "yum install -y zip unzip kernel-headers perl-IPC-Cmd flex opencl-headers ocl-icd ocl-icd-devel"
 
 [tool.cibuildwheel.macos]
-before-all = ["sudo xcodebuild -runFirstLaunch",
-    "sudo xcode-select -s /Applications/Xcode.app/Contents/Developer",
-    "brew install bison ninja",
-    "export CMAKE_GENERATOR=Xcode",
-    "export MACOSX_DEPLOYMENT_TARGET=10.14",
-    "export VCPKG_ENV_PASSTHROUGH=MACOSX_DEPLOYMENT_TARGET"]
\ No newline at end of file
+before-all = [
+    "sudo xcodebuild -runFirstLaunch",
+    "sudo xcode-select -s /Applications/Xcode.app/Contents/Developer",
+    "brew install bison ninja",
+    "export CMAKE_GENERATOR=Xcode",
+    "export MACOSX_DEPLOYMENT_TARGET=10.14",
+    "export VCPKG_ENV_PASSTHROUGH=MACOSX_DEPLOYMENT_TARGET",
+]
diff --git a/pytest.ini b/pytest.ini
index 31361951..9ee7c354 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -1,3 +1,4 @@
 [pytest]
 testpaths = tests
-norecursedirs=tests/helpers
\ No newline at end of file
+norecursedirs = tests/helpers vcpkg
+addopts = -s
\ No newline at end of file
diff --git a/tests/conftest.py b/tests/conftest.py
index e6567dda..8282ce59 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,4 +1,5 @@
 # Solution to import helper script by this answer https://stackoverflow.com/a/33515264
 import os
 import sys
-sys.path.append(os.path.join(os.path.dirname(__file__), "helpers"))
+
+sys.path.append(os.path.abspath(os.path.dirname(__file__)))
diff --git a/tests/dataset/crossvalidation_test.py b/tests/dataset/crossvalidation_test.py
index 572314d8..7bae2954 100644
--- a/tests/dataset/crossvalidation_test.py
+++ b/tests/dataset/crossvalidation_test.py
@@ -1,11 +1,9 @@
 import numpy as np
 import pybnesian as pbn
+from helpers.data import DATA_SIZE, generate_normal_data
 
-import util_test
+df = generate_normal_data(DATA_SIZE)
 
-SIZE = 10000
-
-df = util_test.generate_normal_data(SIZE)
 
 def test_cv_disjoint_indices():
     cv = pbn.CrossValidation(df)
@@ -14,17 +12,29 @@ def test_cv_disjoint_indices():
         nptrain = np.asarray(train_indices)
         nptest = np.asarray(test_indices)
         combination = np.hstack((nptrain, nptest))
-
-        assert np.all(np.sort(combination) == np.arange(SIZE)), "Not all the examples are included in the cross validation."
-        assert np.all(train_df.to_pandas().to_numpy() == df.iloc[train_indices,:].to_numpy()), \
-            "The CV iterator do not slice the train dataset exactly equal as the CV indices iterator."
-        assert np.all(test_df.to_pandas().to_numpy() == df.iloc[test_indices,:].to_numpy()), \
-            "The CV iterator do not slice the test dataset exactly equal as the CV indices iterator."
-        assert np.setdiff1d(nptrain, nptest).shape == nptrain.shape, "The train indices includes test indices"
-        assert np.setdiff1d(nptest, nptrain).shape == nptest.shape, "The test indices includes train indices"
-        assert np.all(np.sort(np.setdiff1d(train_indices, test_indices)) == np.sort(train_indices)), "The train indices includes test indices"
-        assert np.all(np.sort(np.setdiff1d(test_indices, train_indices)) == np.sort(test_indices)), "The test indices includes train indices"
+        assert np.all(
+            np.sort(combination) == np.arange(DATA_SIZE)
+        ), "Not all the examples are included in the cross validation."
+        assert np.all(
+            train_df.to_pandas().to_numpy() == df.iloc[train_indices, :].to_numpy()
+        ), "The CV iterator does not slice the train dataset exactly as the CV indices iterator."
+        assert np.all(
+            test_df.to_pandas().to_numpy() == df.iloc[test_indices, :].to_numpy()
+        ), "The CV iterator does not slice the test dataset exactly as the CV indices iterator."
+
+        assert (
+            np.setdiff1d(nptrain, nptest).shape == nptrain.shape
+        ), "The train indices include test indices"
+        assert (
+            np.setdiff1d(nptest, nptrain).shape == nptest.shape
+        ), "The test indices include train indices"
+        assert np.all(
+            np.sort(np.setdiff1d(train_indices, test_indices)) == np.sort(train_indices)
+        ), "The train indices include test indices"
+        assert np.all(
+            np.sort(np.setdiff1d(test_indices, train_indices)) == np.sort(test_indices)
+        ), "The test indices include train indices"
 
 
 def test_cv_fold():
@@ -33,35 +43,48 @@ def test_cv_fold():
     for i, (train_df, test_df) in enumerate(cv):
         train_fold, test_fold = cv.fold(i)
 
-        assert train_fold.equals(train_df), "Train DataFrame fold() and __iter__ are not equal."
-        assert test_fold.equals(test_df), "Test DataFrame fold() and __iter__ are not equal."
+        assert train_fold.equals(
+            train_df
+        ), "Train DataFrame fold() and __iter__ are not equal."
+        assert test_fold.equals(
+            test_df
+        ), "Test DataFrame fold() and __iter__ are not equal."
 
 
 def test_cv_seed():
     cv = pbn.CrossValidation(df, seed=0)
-    
+
     dataframes = list(cv)
 
     cv2 = pbn.CrossValidation(df, seed=0)
 
     for (train_cv, test_cv), (train_cv2, test_cv2) in zip(dataframes, cv2):
-        assert train_cv.equals(train_cv2), "Train CV DataFrames with the same seed are not equal."
- assert test_cv.equals(test_cv2), "Test CV DataFrames with the same seed are not equal." + assert train_cv.equals( + train_cv2 + ), "Train CV DataFrames with the same seed are not equal." + assert test_cv.equals( + test_cv2 + ), "Test CV DataFrames with the same seed are not equal." cv3 = pbn.CrossValidation(df, seed=1) for (train_cv2, test_cv2), (train_cv3, test_cv3) in zip(cv2, cv3): - assert not train_cv2.equals(train_cv3), "Train CV DataFrames with different seeds return the same result." - assert not test_cv2.equals(test_cv3), "Test CV DataFrames with different seeds return the same result." + assert not train_cv2.equals( + train_cv3 + ), "Train CV DataFrames with different seeds return the same result." + assert not test_cv2.equals( + test_cv3 + ), "Test CV DataFrames with different seeds return the same result." + def test_cv_num_folds(): cv = pbn.CrossValidation(df) - + dataframes = list(cv) indices = list(cv.indices()) assert len(dataframes) == 10, "Default number of folds must be 10." assert len(indices) == 10, "Default number of folds must be 10." - + cv5 = pbn.CrossValidation(df, 5) dataframes = list(cv5) indices = list(cv5.indices()) @@ -71,103 +94,180 @@ def test_cv_num_folds(): def test_cv_loc(): cv = pbn.CrossValidation(df) - - for (train_df, test_df) in cv.loc("a"): - assert train_df.num_columns == 1, "Only column \"a\" must be present in train DataFrame." - assert test_df.num_columns == 1, "Only column \"a\" must be present in test DataFrame." + + for train_df, test_df in cv.loc("A"): + assert ( + train_df.num_columns == 1 + ), 'Only column "A" must be present in train DataFrame.' + assert ( + test_df.num_columns == 1 + ), 'Only column "A" must be present in test DataFrame.' train_schema = train_df.schema test_schema = test_df.schema - assert train_schema.names == ["a"], "Only column \"a\" must be present in train DataFrame." - assert test_schema.names == ["a"], "Only column \"a\" must be present in test DataFrame." - - for (train_df, test_df) in cv.loc(1): - assert train_df.num_columns == 1, "Only column \"b\" must be present in train DataFrame." - assert test_df.num_columns == 1, "Only column \"b\" must be present in test DataFrame." + assert train_schema.names == [ + "A" + ], 'Only column "A" must be present in train DataFrame.' + assert test_schema.names == [ + "A" + ], 'Only column "A" must be present in test DataFrame.' + + for train_df, test_df in cv.loc(1): + assert ( + train_df.num_columns == 1 + ), 'Only column "B" must be present in train DataFrame.' + assert ( + test_df.num_columns == 1 + ), 'Only column "B" must be present in test DataFrame.' train_schema = train_df.schema test_schema = test_df.schema - assert train_schema.names == ["b"], "Only column \"b\" must be present in train DataFrame." - assert test_schema.names == ["b"], "Only column \"b\" must be present in test DataFrame." - - for (train_df, test_df) in cv.loc(["b", "d"]): - assert train_df.num_columns == 2, "Only columns [\"b\", \"d\"] must be present in train DataFrame." - assert test_df.num_columns == 2, "Only column [\"b\", \"d\"] must be present in test DataFrame." + assert train_schema.names == [ + "B" + ], 'Only column "B" must be present in train DataFrame.' + assert test_schema.names == [ + "B" + ], 'Only column "B" must be present in test DataFrame.' + + for train_df, test_df in cv.loc(["B", "D"]): + assert ( + train_df.num_columns == 2 + ), 'Only columns ["B", "D"] must be present in train DataFrame.' 
+ assert ( + test_df.num_columns == 2 + ), 'Only column ["B", "D"] must be present in test DataFrame.' train_schema = train_df.schema test_schema = test_df.schema - assert train_schema.names == ["b", "d"], "Only column [\"b\", \"d\"] must be present in train DataFrame." - assert test_schema.names == ["b", "d"], "Only column [\"b\", \"d\"] must be present in test DataFrame." - - for (train_df, test_df) in cv.loc([0, 2]): - assert train_df.num_columns == 2, "Only columns [\"a\", \"c\"] must be present in train DataFrame." - assert test_df.num_columns == 2, "Only column [\"a\", \"c\"] must be present in test DataFrame." + assert train_schema.names == [ + "B", + "D", + ], 'Only column ["B", "D"] must be present in train DataFrame.' + assert test_schema.names == [ + "B", + "D", + ], 'Only column ["B", "D"] must be present in test DataFrame.' + + for train_df, test_df in cv.loc([0, 2]): + assert ( + train_df.num_columns == 2 + ), 'Only columns ["A", "C"] must be present in train DataFrame.' + assert ( + test_df.num_columns == 2 + ), 'Only column ["A", "C"] must be present in test DataFrame.' train_schema = train_df.schema test_schema = test_df.schema - assert train_schema.names == ["a", "c"], "Only column [\"a\", \"c\"] must be present in train DataFrame." - assert test_schema.names == ["a", "c"], "Only column [\"a\", \"c\"] must be present in test DataFrame." + assert train_schema.names == [ + "A", + "C", + ], 'Only column ["A", "C"] must be present in train DataFrame.' + assert test_schema.names == [ + "A", + "C", + ], 'Only column ["A", "C"] must be present in test DataFrame.' def test_cv_null(): np.random.seed(0) - a_null = np.random.randint(0, SIZE, size=100) - b_null = np.random.randint(0, SIZE, size=100) - c_null = np.random.randint(0, SIZE, size=100) - d_null = np.random.randint(0, SIZE, size=100) + a_null = np.random.randint(0, DATA_SIZE, size=100) + b_null = np.random.randint(0, DATA_SIZE, size=100) + c_null = np.random.randint(0, DATA_SIZE, size=100) + d_null = np.random.randint(0, DATA_SIZE, size=100) df_null = df - df_null.loc[df_null.index[a_null], 'a'] = np.nan - df_null.loc[df_null.index[b_null], 'b'] = np.nan - df_null.loc[df_null.index[c_null], 'c'] = np.nan - df_null.loc[df_null.index[d_null], 'd'] = np.nan + df_null.loc[df_null.index[a_null], "A"] = np.nan + df_null.loc[df_null.index[b_null], "B"] = np.nan + df_null.loc[df_null.index[c_null], "C"] = np.nan + df_null.loc[df_null.index[d_null], "D"] = np.nan non_null = df_null.dropna() cv = pbn.CrossValidation(df_null) for (train_df, test_df), (train_indices, test_indices) in zip(cv, cv.indices()): - assert non_null.shape[0] == (train_df.num_rows + test_df.num_rows), "CV did not remove null instances correctly." + assert non_null.shape[0] == ( + train_df.num_rows + test_df.num_rows + ), "CV did not remove null instances correctly." nptrain = np.asarray(train_indices) nptest = np.asarray(test_indices) combination = np.hstack((nptrain, nptest)) - actual_combination = np.sort(np.setdiff1d(np.arange(SIZE), np.asarray(list(set(list(a_null) + list(b_null) + list(c_null) + list(d_null)))))) - - assert np.all(np.sort(combination) == actual_combination), "Not all the examples are included in the cross validation." - assert np.all(train_df.to_pandas().to_numpy() == df.iloc[train_indices,:].to_numpy()), \ - "The CV iterator do not slice the train dataset exactly equal as the CV indices iterator." 
- assert np.all(test_df.to_pandas().to_numpy() == df.iloc[test_indices,:].to_numpy()), \ - "The CV iterator do not slice the test dataset exactly equal as the CV indices iterator." - - assert np.setdiff1d(nptrain, nptest).shape == nptrain.shape, "The train indices includes test indices" - assert np.setdiff1d(nptest, nptrain).shape == nptest.shape, "The test indices includes train indices" - assert np.all(np.sort(np.setdiff1d(train_indices, test_indices)) == np.sort(train_indices)), "The train indices includes test indices" - assert np.all(np.sort(np.setdiff1d(test_indices, train_indices)) == np.sort(test_indices)), "The test indices includes train indices" + actual_combination = np.sort( + np.setdiff1d( + np.arange(DATA_SIZE), + np.asarray( + list(set(list(a_null) + list(b_null) + list(c_null) + list(d_null))) + ), + ) + ) + + assert np.all( + np.sort(combination) == actual_combination + ), "Not all the examples are included in the cross validation." + assert np.all( + train_df.to_pandas().to_numpy() == df.iloc[train_indices, :].to_numpy() + ), "The CV iterator do not slice the train dataset exactly equal as the CV indices iterator." + assert np.all( + test_df.to_pandas().to_numpy() == df.iloc[test_indices, :].to_numpy() + ), "The CV iterator do not slice the test dataset exactly equal as the CV indices iterator." + + assert ( + np.setdiff1d(nptrain, nptest).shape == nptrain.shape + ), "The train indices includes test indices" + assert ( + np.setdiff1d(nptest, nptrain).shape == nptest.shape + ), "The test indices includes train indices" + assert np.all( + np.sort(np.setdiff1d(train_indices, test_indices)) == np.sort(train_indices) + ), "The train indices includes test indices" + assert np.all( + np.sort(np.setdiff1d(test_indices, train_indices)) == np.sort(test_indices) + ), "The test indices includes train indices" cv_include_null = pbn.CrossValidation(df_null, include_null=True) - for (train_df, test_df), (train_indices, test_indices) in zip(cv_include_null, cv_include_null.indices()): - assert (train_df.num_rows + test_df.num_rows) == SIZE, "CV did not remove null instances correctly." + for (train_df, test_df), (train_indices, test_indices) in zip( + cv_include_null, cv_include_null.indices() + ): + assert ( + train_df.num_rows + test_df.num_rows + ) == DATA_SIZE, "CV did not remove null instances correctly." nptrain = np.asarray(train_indices) nptest = np.asarray(test_indices) combination = np.hstack((nptrain, nptest)) train_df_mat = train_df.to_pandas().to_numpy() - train_indices_mat = df.iloc[train_indices,:].to_numpy() + train_indices_mat = df.iloc[train_indices, :].to_numpy() test_df_mat = test_df.to_pandas().to_numpy() - test_indices_mat = df.iloc[test_indices,:].to_numpy() - - assert np.all(np.sort(combination) == np.arange(SIZE)), "Not all the examples are included in the cross validation." - assert np.all(np.isnan(train_df_mat) == np.isnan(train_indices_mat)), \ - "The null values are wrongly specified in the train DataFrame." - - assert np.all(train_df_mat[~np.isnan(train_df_mat)] == train_indices_mat[~np.isnan(train_df_mat)]), \ - "The CV iterator do not slice the train dataset exactly equal as the CV indices iterator." - - assert np.all(np.isnan(test_df_mat) == np.isnan(test_indices_mat)), \ - "The null values are wrongly specified in the test DataFrame." - assert np.all(test_df_mat[~np.isnan(test_df_mat)] == test_indices_mat[~np.isnan(test_df_mat)]), \ - "The CV iterator do not slice the test dataset exactly equal as the CV indices iterator." 
- - assert np.setdiff1d(nptrain, nptest).shape == nptrain.shape, "The train indices includes test indices" - assert np.setdiff1d(nptest, nptrain).shape == nptest.shape, "The test indices includes train indices" - assert np.all(np.sort(np.setdiff1d(train_indices, test_indices)) == np.sort(train_indices)), "The train indices includes test indices" - assert np.all(np.sort(np.setdiff1d(test_indices, train_indices)) == np.sort(test_indices)), "The test indices includes train indices" \ No newline at end of file + test_indices_mat = df.iloc[test_indices, :].to_numpy() + + assert np.all( + np.sort(combination) == np.arange(DATA_SIZE) + ), "Not all the examples are included in the cross validation." + assert np.all( + np.isnan(train_df_mat) == np.isnan(train_indices_mat) + ), "The null values are wrongly specified in the train DataFrame." + + assert np.all( + train_df_mat[~np.isnan(train_df_mat)] + == train_indices_mat[~np.isnan(train_df_mat)] + ), "The CV iterator do not slice the train dataset exactly equal as the CV indices iterator." + + assert np.all( + np.isnan(test_df_mat) == np.isnan(test_indices_mat) + ), "The null values are wrongly specified in the test DataFrame." + assert np.all( + test_df_mat[~np.isnan(test_df_mat)] + == test_indices_mat[~np.isnan(test_df_mat)] + ), "The CV iterator do not slice the test dataset exactly equal as the CV indices iterator." + + assert ( + np.setdiff1d(nptrain, nptest).shape == nptrain.shape + ), "The train indices includes test indices" + assert ( + np.setdiff1d(nptest, nptrain).shape == nptest.shape + ), "The test indices includes train indices" + assert np.all( + np.sort(np.setdiff1d(train_indices, test_indices)) == np.sort(train_indices) + ), "The train indices includes test indices" + assert np.all( + np.sort(np.setdiff1d(test_indices, train_indices)) == np.sort(test_indices) + ), "The test indices includes train indices" diff --git a/tests/dataset/holdout_test.py b/tests/dataset/holdout_test.py index 54bb6ad5..4cf9d373 100644 --- a/tests/dataset/holdout_test.py +++ b/tests/dataset/holdout_test.py @@ -1,42 +1,57 @@ import numpy as np import pandas as pd import pybnesian as pbn +from helpers.data import DATA_SIZE, generate_normal_data -import util_test +df = generate_normal_data(DATA_SIZE) -SIZE = 10000 - -df = util_test.generate_normal_data(SIZE) def test_holdout_disjoint(): hold = pbn.HoldOut(df) train_df, test_df = hold.training_data(), hold.test_data() - assert (train_df.num_rows + test_df.num_rows) == SIZE, "HoldOut do not have the expected number of rows" + assert ( + train_df.num_rows + test_df.num_rows + ) == DATA_SIZE, "HoldOut do not have the expected number of rows" - assert train_df.num_rows == round((1-0.2) * df.shape[0]), "Train DataFrame do not have the expected number of instances" - assert test_df.num_rows == round(0.2 * df.shape[0]), "Test DataFrame do not have the expected number of instances" + assert train_df.num_rows == round( + (1 - 0.2) * df.shape[0] + ), "Train DataFrame do not have the expected number of instances" + assert test_df.num_rows == round( + 0.2 * df.shape[0] + ), "Test DataFrame do not have the expected number of instances" combination = pd.concat([train_df.to_pandas(), test_df.to_pandas()]) - assert df.sort_values("a", axis=0).reset_index(drop=True)\ - .equals(combination.sort_values("a", axis=0).reset_index(drop=True)),\ - "The combination of train and test dataset is not equal to the original DataFrame." 
- + assert ( + df.sort_values("A", axis=0) + .reset_index(drop=True) + .equals(combination.sort_values("A", axis=0).reset_index(drop=True)) + ), "The combination of train and test dataset is not equal to the original DataFrame." + hold = pbn.HoldOut(df, test_ratio=0.3) train_df, test_df = hold.training_data(), hold.test_data() - assert (train_df.num_rows + test_df.num_rows) == SIZE, "HoldOut do not have the expected number of rows" + assert ( + train_df.num_rows + test_df.num_rows + ) == DATA_SIZE, "HoldOut do not have the expected number of rows" - assert train_df.num_rows == round((1-0.3) * df.shape[0]), "Train DataFrame do not have the expected number of instances" - assert test_df.num_rows == round(0.3 * df.shape[0]), "Test DataFrame do not have the expected number of instances" + assert train_df.num_rows == round( + (1 - 0.3) * df.shape[0] + ), "Train DataFrame do not have the expected number of instances" + assert test_df.num_rows == round( + 0.3 * df.shape[0] + ), "Test DataFrame do not have the expected number of instances" combination = pd.concat([train_df.to_pandas(), test_df.to_pandas()]) - assert df.sort_values("a", axis=0).reset_index(drop=True)\ - .equals(combination.sort_values("a", axis=0).reset_index(drop=True)),\ - "The combination of train and test dataset is not equal to the original DataFrame." + assert ( + df.sort_values("A", axis=0) + .reset_index(drop=True) + .equals(combination.sort_values("A", axis=0).reset_index(drop=True)) + ), "The combination of train and test dataset is not equal to the original DataFrame." + def test_holdout_seed(): hold = pbn.HoldOut(df, seed=0) @@ -45,53 +60,76 @@ def test_holdout_seed(): train_df, test_df = hold.training_data(), hold.test_data() train_df2, test_df2 = hold2.training_data(), hold2.test_data() - assert train_df.equals(train_df2), "Train CV DataFrames with the same seed are not equal." - assert test_df.equals(test_df2), "Test CV DataFrames with the same seed are not equal." + assert train_df.equals( + train_df2 + ), "Train CV DataFrames with the same seed are not equal." + assert test_df.equals( + test_df2 + ), "Test CV DataFrames with the same seed are not equal." hold3 = pbn.HoldOut(df, seed=1) train_df3, test_df3 = hold3.training_data(), hold3.test_data() - assert not train_df.equals(train_df3), "Train CV DataFrames with different seeds return the same result." - assert not test_df.equals(test_df3), "Test CV DataFrames with different seeds return the same result." + assert not train_df.equals( + train_df3 + ), "Train CV DataFrames with different seeds return the same result." + assert not test_df.equals( + test_df3 + ), "Test CV DataFrames with different seeds return the same result." 
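The two tests above pin down the HoldOut contract: the split sizes are rounded from test_ratio, and the two parts always recompose the original data. A minimal restatement of that invariant, as a sketch that assumes only the pbn.HoldOut API and the helpers.data generator already imported at the top of this file (df_check is a hypothetical example frame):

    df_check = generate_normal_data(100, seed=2)
    hold_check = pbn.HoldOut(df_check, test_ratio=0.3)
    assert hold_check.test_data().num_rows == round(0.3 * 100)
    assert hold_check.training_data().num_rows == 100 - round(0.3 * 100)
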
+ def test_holdout_null(): np.random.seed(0) - a_null = np.random.randint(0, SIZE, size=100) - b_null = np.random.randint(0, SIZE, size=100) - c_null = np.random.randint(0, SIZE, size=100) - d_null = np.random.randint(0, SIZE, size=100) + a_null = np.random.randint(0, DATA_SIZE, size=100) + b_null = np.random.randint(0, DATA_SIZE, size=100) + c_null = np.random.randint(0, DATA_SIZE, size=100) + d_null = np.random.randint(0, DATA_SIZE, size=100) df_null = df - df_null.loc[df_null.index[a_null], 'a'] = np.nan - df_null.loc[df_null.index[b_null], 'b'] = np.nan - df_null.loc[df_null.index[c_null], 'c'] = np.nan - df_null.loc[df_null.index[d_null], 'd'] = np.nan + df_null.loc[df_null.index[a_null], "A"] = np.nan + df_null.loc[df_null.index[b_null], "B"] = np.nan + df_null.loc[df_null.index[c_null], "C"] = np.nan + df_null.loc[df_null.index[d_null], "D"] = np.nan non_null = df_null.dropna() hold = pbn.HoldOut(df_null) train_df, test_df = hold.training_data(), hold.test_data() - assert (train_df.num_rows + test_df.num_rows) == non_null.shape[0], "HoldOut do not have the expected number of rows" - assert train_df.num_rows == round((1-0.2) * non_null.shape[0]), "Train DataFrame do not have the expected number of instances" - assert test_df.num_rows == round(0.2 * non_null.shape[0]), "Test DataFrame do not have the expected number of instances" + assert (train_df.num_rows + test_df.num_rows) == non_null.shape[ + 0 + ], "HoldOut do not have the expected number of rows" + assert train_df.num_rows == round( + (1 - 0.2) * non_null.shape[0] + ), "Train DataFrame do not have the expected number of instances" + assert test_df.num_rows == round( + 0.2 * non_null.shape[0] + ), "Test DataFrame do not have the expected number of instances" combination = pd.concat([train_df.to_pandas(), test_df.to_pandas()]) - assert combination.sort_values("a", axis=0).reset_index(drop=True)\ - .equals(non_null.sort_values("a", axis=0).reset_index(drop=True)),\ - "The combination of train and test dataset is not equal to the original DataFrame." + assert ( + combination.sort_values("A", axis=0) + .reset_index(drop=True) + .equals(non_null.sort_values("A", axis=0).reset_index(drop=True)) + ), "The combination of train and test dataset is not equal to the original DataFrame." hold_null = pbn.HoldOut(df_null, include_null=True) train_df, test_df = hold_null.training_data(), hold_null.test_data() - assert (train_df.num_rows + test_df.num_rows) == SIZE, "HoldOut do not have the expected number of rows" - assert train_df.num_rows == round((1-0.2) * SIZE), "Train DataFrame do not have the expected number of instances" - assert test_df.num_rows == round(0.2 * SIZE), "Test DataFrame do not have the expected number of instances" + assert ( + train_df.num_rows + test_df.num_rows + ) == DATA_SIZE, "HoldOut do not have the expected number of rows" + assert train_df.num_rows == round( + (1 - 0.2) * DATA_SIZE + ), "Train DataFrame do not have the expected number of instances" + assert test_df.num_rows == round( + 0.2 * DATA_SIZE + ), "Test DataFrame do not have the expected number of instances" combination = pd.concat([train_df.to_pandas(), test_df.to_pandas()]) - assert combination.sort_values(["a", "b", "c", "d"], axis=0).reset_index(drop=True)\ - .equals(df.sort_values(["a", "b", "c", "d"], axis=0).reset_index(drop=True)),\ - "The combination of train and test dataset is not equal to the original DataFrame." 
- - \ No newline at end of file + assert ( + combination.sort_values(["A", "B", "C", "D"], axis=0) + .reset_index(drop=True) + .equals(df.sort_values(["A", "B", "C", "D"], axis=0).reset_index(drop=True)) + ), "The combination of train and test dataset is not equal to the original DataFrame." diff --git a/tests/factors/continuous/CKDE_test.py b/tests/factors/continuous/CKDE_test.py index 3e605742..afcc66ba 100644 --- a/tests/factors/continuous/CKDE_test.py +++ b/tests/factors/continuous/CKDE_test.py @@ -1,57 +1,76 @@ -import pytest import numpy as np -import pyarrow as pa import pandas as pd +import pyarrow as pa import pybnesian as pbn +import pytest +from helpers.data import DATA_SIZE, generate_normal_data from scipy.stats import gaussian_kde -from scipy.stats import norm from scipy.stats import multivariate_normal as mvn -from scipy.special import logsumexp - -import util_test +from scipy.stats import norm -SIZE = 10000 SMALL_SIZE = 10 TEST_SIZE = 50 -df = util_test.generate_normal_data(SIZE, seed=0) -df_small = util_test.generate_normal_data(SMALL_SIZE, seed=0) -df_float = df.astype('float32') -df_small_float = df_small.astype('float32') +df = generate_normal_data(DATA_SIZE, seed=0) +df_small = generate_normal_data(SMALL_SIZE, seed=0) +df_float = df.astype("float32") +df_small_float = df_small.astype("float32") + def test_variable(): - for variable, evidence in [('a', []), ('b', ['a']), ('c', ['a', 'b']), ('d', ['a', 'b', 'c'])]: + for variable, evidence in [ + ("A", []), + ("B", ["A"]), + ("C", ["A", "B"]), + ("D", ["A", "B", "C"]), + ]: cpd = pbn.CKDE(variable, evidence) assert cpd.variable() == variable + def test_evidence(): - for variable, evidence in [('a', []), ('b', ['a']), ('c', ['a', 'b']), ('d', ['a', 'b', 'c'])]: + for variable, evidence in [ + ("A", []), + ("B", ["A"]), + ("C", ["A", "B"]), + ("D", ["A", "B", "C"]), + ]: cpd = pbn.CKDE(variable, evidence) assert cpd.evidence() == evidence + def test_kde_data_type(): - k = pbn.CKDE("a", []) + k = pbn.CKDE("A", []) with pytest.raises(ValueError) as ex: k.data_type() - "CKDE factor not fitted" in str(ex.value) + assert "CKDE factor not fitted" in str(ex.value) k.fit(df) assert k.data_type() == pa.float64() k.fit(df_float) assert k.data_type() == pa.float32() + def test_ckde_kde_joint(): def _test_ckde_kde_joint_iter(variable, evidence, _df): cpd = pbn.CKDE(variable, evidence) cpd.fit(_df) kde_joint = cpd.kde_joint kde_joint().bandwidth = np.eye(len(evidence) + 1) - assert np.all(cpd.kde_joint().bandwidth == np.eye(len(evidence) + 1)), "kde_joint do not return a reference to the KDE joint, but a copy." - - for variable, evidence in [('a', []), ('b', ['a']), ('c', ['a', 'b']), ('d', ['a', 'b', 'c'])]: + assert np.all( + cpd.kde_joint().bandwidth == np.eye(len(evidence) + 1) + ), "kde_joint do not return a reference to the KDE joint, but a copy." + + for variable, evidence in [ + ("A", []), + ("B", ["A"]), + ("C", ["A", "B"]), + ("D", ["A", "B", "C"]), + ]: _test_ckde_kde_joint_iter(variable, evidence, df) _test_ckde_kde_joint_iter(variable, evidence, df_float) + def test_ckde_kde_marg(): def _test_ckde_kde_marg_iter(variable, evidence, _df): cpd = pbn.CKDE(variable, evidence) @@ -61,20 +80,31 @@ def _test_ckde_kde_marg_iter(variable, evidence, _df): if evidence: assert kde_marg().fitted() kde_marg().bandwidth = np.eye(len(evidence)) - assert np.all(cpd.kde_marg().bandwidth == np.eye(len(evidence))), "kde_marg do not return a reference to the KDE joint, but a copy." 
+ assert np.all( + cpd.kde_marg().bandwidth == np.eye(len(evidence)) + ), "kde_marg do not return a reference to the KDE joint, but a copy." else: # kde_marg contains garbage if there is no evidence pass - for variable, evidence in [('a', []), ('b', ['a']), ('c', ['a', 'b']), ('d', ['a', 'b', 'c'])]: + for variable, evidence in [ + ("A", []), + ("B", ["A"]), + ("C", ["A", "B"]), + ("D", ["A", "B", "C"]), + ]: _test_ckde_kde_marg_iter(variable, evidence, df) _test_ckde_kde_marg_iter(variable, evidence, df_float) + def test_ckde_fit(): def _test_ckde_fit(variables, _df, instances): npdata = _df.loc[:, variables].to_numpy() - scipy_kde = gaussian_kde(npdata[:instances, :].T, - bw_method=lambda s : np.power(4 / (s.d + 2), 1 / (s.d + 4)) * s.scotts_factor()) + scipy_kde = gaussian_kde( + npdata[:instances, :].T, + bw_method=lambda s: np.power(4 / (s.d + 2), 1 / (s.d + 4)) + * s.scotts_factor(), + ) cpd = pbn.CKDE(variable, evidence) assert not cpd.fitted() @@ -83,19 +113,27 @@ def _test_ckde_fit(variables, _df, instances): kde_joint = cpd.kde_joint assert np.all(np.isclose(kde_joint().bandwidth, scipy_kde.covariance)) - + if evidence: kde_marg = cpd.kde_marg - assert np.all(np.isclose(kde_marg().bandwidth, scipy_kde.covariance[1:,1:])) - + assert np.all( + np.isclose(kde_marg().bandwidth, scipy_kde.covariance[1:, 1:]) + ) + assert cpd.num_instances() == instances - for variable, evidence in [('a', []), ('b', ['a']), ('c', ['a', 'b']), ('d', ['a', 'b', 'c'])]: + for variable, evidence in [ + ("A", []), + ("B", ["A"]), + ("C", ["A", "B"]), + ("D", ["A", "B", "C"]), + ]: variables = [variable] + evidence - for instances in [50, 1000, 10000]: + for instances in [50, 1000, 10000]: _test_ckde_fit(variables, df, instances) _test_ckde_fit(variables, df_float, instances) + def test_ckde_fit_null(): def _test_ckde_fit_null(variable, evidence, variables, _df, instances): cpd = pbn.CKDE(variable, evidence) @@ -104,45 +142,56 @@ def _test_ckde_fit_null(variable, evidence, variables, _df, instances): assert cpd.fitted() npdata = _df.loc[:, variables].to_numpy() - npdata_instances = npdata[:instances,:] + npdata_instances = npdata[:instances, :] nan_rows = np.any(np.isnan(npdata_instances), axis=1) - npdata_no_null = npdata_instances[~nan_rows,:] - scipy_kde = gaussian_kde(npdata_no_null.T, - bw_method=lambda s : np.power(4 / (s.d + 2), 1 / (s.d + 4)) * s.scotts_factor()) + npdata_no_null = npdata_instances[~nan_rows, :] + scipy_kde = gaussian_kde( + npdata_no_null.T, + bw_method=lambda s: np.power(4 / (s.d + 2), 1 / (s.d + 4)) + * s.scotts_factor(), + ) kde_joint = cpd.kde_joint assert np.all(np.isclose(kde_joint().bandwidth, scipy_kde.covariance)) - + if evidence: kde_marg = cpd.kde_marg - assert np.all(np.isclose(kde_marg().bandwidth, scipy_kde.covariance[1:,1:])) + assert np.all( + np.isclose(kde_marg().bandwidth, scipy_kde.covariance[1:, 1:]) + ) assert cpd.num_instances() == scipy_kde.n np.random.seed(0) - a_null = np.random.randint(0, SIZE, size=100) - b_null = np.random.randint(0, SIZE, size=100) - c_null = np.random.randint(0, SIZE, size=100) - d_null = np.random.randint(0, SIZE, size=100) + a_null = np.random.randint(0, DATA_SIZE, size=100) + b_null = np.random.randint(0, DATA_SIZE, size=100) + c_null = np.random.randint(0, DATA_SIZE, size=100) + d_null = np.random.randint(0, DATA_SIZE, size=100) df_null = df.copy() - df_null.loc[df_null.index[a_null], 'a'] = np.nan - df_null.loc[df_null.index[b_null], 'b'] = np.nan - df_null.loc[df_null.index[c_null], 'c'] = np.nan - 
df_null.loc[df_null.index[d_null], 'd'] = np.nan + df_null.loc[df_null.index[a_null], "A"] = np.nan + df_null.loc[df_null.index[b_null], "B"] = np.nan + df_null.loc[df_null.index[c_null], "C"] = np.nan + df_null.loc[df_null.index[d_null], "D"] = np.nan df_null_float = df_float.copy() - df_null_float.loc[df_null_float.index[a_null], 'a'] = np.nan - df_null_float.loc[df_null_float.index[b_null], 'b'] = np.nan - df_null_float.loc[df_null_float.index[c_null], 'c'] = np.nan - df_null_float.loc[df_null_float.index[d_null], 'd'] = np.nan - - for variable, evidence in [('a', []), ('b', ['a']), ('c', ['a', 'b']), ('d', ['a', 'b', 'c'])]: + df_null_float.loc[df_null_float.index[a_null], "A"] = np.nan + df_null_float.loc[df_null_float.index[b_null], "B"] = np.nan + df_null_float.loc[df_null_float.index[c_null], "C"] = np.nan + df_null_float.loc[df_null_float.index[d_null], "D"] = np.nan + + for variable, evidence in [ + ("A", []), + ("B", ["A"]), + ("C", ["A", "B"]), + ("D", ["A", "B", "C"]), + ]: variables = [variable] + evidence - for instances in [50, 1000, 10000]: + for instances in [50, 1000, 10000]: _test_ckde_fit_null(variable, evidence, variables, df, instances) _test_ckde_fit_null(variable, evidence, variables, df_float, instances) + def train_scipy_ckde(data, variable, evidence): variables = [variable] + evidence npdata_joint = data.loc[:, variables].to_numpy() @@ -150,15 +199,20 @@ def train_scipy_ckde(data, variable, evidence): nan_rows = np.any(np.isnan(npdata_joint), axis=1) - scipy_kde_joint = gaussian_kde(npdata_joint[~nan_rows,:].T, - bw_method=lambda s : np.power(4 / (s.d + 2), 1 / (s.d + 4)) * s.scotts_factor()) + scipy_kde_joint = gaussian_kde( + npdata_joint[~nan_rows, :].T, + bw_method=lambda s: np.power(4 / (s.d + 2), 1 / (s.d + 4)) * s.scotts_factor(), + ) if evidence: - scipy_kde_marg = gaussian_kde(npdata_marg[~nan_rows,:].T, bw_method=scipy_kde_joint.covariance_factor()) + scipy_kde_marg = gaussian_kde( + npdata_marg[~nan_rows, :].T, bw_method=scipy_kde_joint.factor + ) else: scipy_kde_marg = None return scipy_kde_joint, scipy_kde_marg + def scipy_ckde_logpdf(test_data, joint_kde, marg_kde, variable, evidence): variables = [variable] + evidence test_data_joint = test_data.loc[:, variables].to_numpy() @@ -172,12 +226,15 @@ def scipy_ckde_logpdf(test_data, joint_kde, marg_kde, variable, evidence): result = np.full(test_data.shape[0], np.nan, dtype=np.float64) if evidence: - result[~nan_rows] = joint_kde.logpdf(test_data_joint[~nan_rows,:].T) - marg_kde.logpdf(test_data_marg[~nan_rows,:].T) + result[~nan_rows] = joint_kde.logpdf( + test_data_joint[~nan_rows, :].T + ) - marg_kde.logpdf(test_data_marg[~nan_rows, :].T) else: - result[~nan_rows] = joint_kde.logpdf(test_data_joint[~nan_rows,:].T) + result[~nan_rows] = joint_kde.logpdf(test_data_joint[~nan_rows, :].T) return result + def scipy_ckde_cdf(test_data, joint_kde, marg_kde, variable, evidence): variables = [variable] + evidence test_data_joint = test_data.loc[:, variables].to_numpy() @@ -196,28 +253,46 @@ def scipy_ckde_cdf(test_data, joint_kde, marg_kde, variable, evidence): if evidence: bandwidth = joint_kde.covariance - cond_var = bandwidth[0,0] - bandwidth[0, 1:].dot(np.linalg.inv(bandwidth[1:, 1:])).dot(bandwidth[1:, 0]) + cond_var = bandwidth[0, 0] - bandwidth[0, 1:].dot( + np.linalg.inv(bandwidth[1:, 1:]) + ).dot(bandwidth[1:, 0]) for test_index in np.where(~np.any(np.isnan(test_data_joint), axis=1))[0]: - w = mvn.logpdf(marg_kde.dataset.T, mean=test_data_marg[test_index,:], cov=marg_kde.covariance) + w = 
mvn.logpdf( + marg_kde.dataset.T, + mean=test_data_marg[test_index, :], + cov=marg_kde.covariance, + ) w = np.exp(w) total_w[:, test_index] = w - evidence_diff = test_data_marg[test_index,:] - joint_kde.dataset[1:,:].T - cond_mean = joint_kde.dataset[0,:] + bandwidth[0,1:].dot(np.linalg.inv(bandwidth[1:,1:])).dot(evidence_diff.T) + evidence_diff = test_data_marg[test_index, :] - joint_kde.dataset[1:, :].T + cond_mean = joint_kde.dataset[0, :] + bandwidth[0, 1:].dot( + np.linalg.inv(bandwidth[1:, 1:]) + ).dot(evidence_diff.T) conditional_mean[:, test_index] = cond_mean - total_cdf[:, test_index] = norm.cdf(test_data_joint[test_index,0], cond_mean, np.sqrt(cond_var)) + total_cdf[:, test_index] = norm.cdf( + test_data_joint[test_index, 0], cond_mean, np.sqrt(cond_var) + ) - result[test_index] = np.dot(w, norm.cdf(test_data_joint[test_index,0], cond_mean, np.sqrt(cond_var))) + result[test_index] = np.dot( + w, + norm.cdf(test_data_joint[test_index, 0], cond_mean, np.sqrt(cond_var)), + ) result /= np.sum(total_w, axis=0) else: - cdf = norm.cdf(test_data_joint[~nan_rows], joint_kde.dataset, np.sqrt(joint_kde.covariance[0,0])) + cdf = norm.cdf( + test_data_joint[~nan_rows], + joint_kde.dataset, + np.sqrt(joint_kde.covariance[0, 0]), + ) result[~nan_rows] = np.sum((1 / joint_kde.n) * cdf, axis=1) return result + def test_ckde_logl(): def _test_ckde_logl(variable, evidence, _df, _test_df): cpd = pbn.CKDE(variable, evidence) @@ -225,33 +300,45 @@ def _test_ckde_logl(variable, evidence, _df, _test_df): scipy_kde_joint, scipy_kde_marg = train_scipy_ckde(_df, variable, evidence) logl = cpd.logl(_test_df) - scipy = scipy_ckde_logpdf(_test_df, scipy_kde_joint, scipy_kde_marg, variable, evidence) + scipy = scipy_ckde_logpdf( + _test_df, scipy_kde_joint, scipy_kde_marg, variable, evidence + ) - if np.all(_df.dtypes == 'float32'): + if np.all(_df.dtypes == "float32"): assert np.all(np.isclose(logl, scipy, atol=0.0005)) else: assert np.all(np.isclose(logl, scipy)) - - test_df = util_test.generate_normal_data(TEST_SIZE, seed=1) - test_df_float = test_df.astype('float32') - for variable, evidence in [('a', []), ('b', ['a']), ('c', ['a', 'b']), ('d', ['a', 'b', 'c'])]: + test_df = generate_normal_data(TEST_SIZE, seed=1) + test_df_float = test_df.astype("float32") + + for variable, evidence in [ + ("A", []), + ("B", ["A"]), + ("C", ["A", "B"]), + ("D", ["A", "B", "C"]), + ]: _test_ckde_logl(variable, evidence, df, test_df) _test_ckde_logl(variable, evidence, df_small, test_df) _test_ckde_logl(variable, evidence, df_float, test_df_float) _test_ckde_logl(variable, evidence, df_small_float, test_df_float) - cpd = pbn.CKDE('d', ['a', 'b', 'c']) + cpd = pbn.CKDE("D", ["A", "B", "C"]) cpd.fit(df) - cpd2 = pbn.CKDE('d', ['c', 'b', 'a']) + cpd2 = pbn.CKDE("D", ["C", "B", "A"]) cpd2.fit(df) - assert np.all(np.isclose(cpd.logl(test_df), cpd2.logl(test_df))), "Order of evidence changes logl() result." + assert np.all( + np.isclose(cpd.logl(test_df), cpd2.logl(test_df)) + ), "Order of evidence changes logl() result." - cpd = pbn.CKDE('d', ['a', 'b', 'c']) + cpd = pbn.CKDE("D", ["A", "B", "C"]) cpd.fit(df_float) - cpd2 = pbn.CKDE('d', ['c', 'b', 'a']) + cpd2 = pbn.CKDE("D", ["C", "B", "A"]) cpd2.fit(df_float) - assert np.all(np.isclose(cpd.logl(test_df_float), cpd2.logl(test_df_float), atol=0.0005)), "Order of evidence changes logl() result." + assert np.all( + np.isclose(cpd.logl(test_df_float), cpd2.logl(test_df_float), atol=0.0005) + ), "Order of evidence changes logl() result." 
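The reference helpers above encode two standard KDE facts. train_scipy_ckde() and scipy_ckde_logpdf() use the identity log f(x | e) = log f(x, e) - log f(e), computed as joint minus marginal log-density with a shared bandwidth factor. scipy_ckde_cdf() expands the conditional CDF of the Gaussian mixture: kernel i contributes a univariate normal with mean m_i = x_i + H_xe @ inv(H_ee) @ (e - e_i) and shared variance s2 = H_xx - H_xe @ inv(H_ee) @ H_ex, weighted by the evidence kernel density w_i proportional to N(e; e_i, H_ee). A small self-contained sketch of the log-density identity (toy data; the two-row layout with the variable first and the evidence second is only an assumption for the example):

    import numpy as np
    from scipy.stats import gaussian_kde

    rng = np.random.default_rng(0)
    sample = rng.normal(size=(2, 500))  # row 0: variable, row 1: evidence
    joint_kde = gaussian_kde(sample)  # estimates f(x, e)
    marg_kde = gaussian_kde(sample[1:], bw_method=joint_kde.factor)  # f(e), same bandwidth factor
    point = np.array([[0.1], [-0.3]])
    cond_logpdf = joint_kde.logpdf(point) - marg_kde.logpdf(point[1:])  # log f(x | e)
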
+ def test_ckde_logl_null(): def _test_ckde_logl_null(variable, evidence, _df, _test_df): @@ -261,15 +348,17 @@ def _test_ckde_logl_null(variable, evidence, _df, _test_df): scipy_kde_joint, scipy_kde_marg = train_scipy_ckde(_df, variable, evidence) logl = cpd.logl(_test_df) - scipy = scipy_ckde_logpdf(_test_df, scipy_kde_joint, scipy_kde_marg, variable, evidence) + scipy = scipy_ckde_logpdf( + _test_df, scipy_kde_joint, scipy_kde_marg, variable, evidence + ) if np.all(_test_df.dtypes == "float32"): assert np.all(np.isclose(logl, scipy, atol=0.0005, equal_nan=True)) else: assert np.all(np.isclose(logl, scipy, equal_nan=True)) - test_df = util_test.generate_normal_data(TEST_SIZE, seed=1) - test_df_float = test_df.astype('float32') + test_df = generate_normal_data(TEST_SIZE, seed=1) + test_df_float = test_df.astype("float32") np.random.seed(0) a_null = np.random.randint(0, TEST_SIZE, size=10) @@ -278,40 +367,50 @@ def _test_ckde_logl_null(variable, evidence, _df, _test_df): d_null = np.random.randint(0, TEST_SIZE, size=10) df_null = test_df.copy() - df_null.loc[df_null.index[a_null], 'a'] = np.nan - df_null.loc[df_null.index[b_null], 'b'] = np.nan - df_null.loc[df_null.index[c_null], 'c'] = np.nan - df_null.loc[df_null.index[d_null], 'd'] = np.nan + df_null.loc[df_null.index[a_null], "A"] = np.nan + df_null.loc[df_null.index[b_null], "B"] = np.nan + df_null.loc[df_null.index[c_null], "C"] = np.nan + df_null.loc[df_null.index[d_null], "D"] = np.nan df_null_float = test_df_float.copy() - df_null_float.loc[df_null_float.index[a_null], 'a'] = np.nan - df_null_float.loc[df_null_float.index[b_null], 'b'] = np.nan - df_null_float.loc[df_null_float.index[c_null], 'c'] = np.nan - df_null_float.loc[df_null_float.index[d_null], 'd'] = np.nan - - for variable, evidence in [('a', []), ('b', ['a']), ('c', ['a', 'b']), ('d', ['a', 'b', 'c'])]: + df_null_float.loc[df_null_float.index[a_null], "A"] = np.nan + df_null_float.loc[df_null_float.index[b_null], "B"] = np.nan + df_null_float.loc[df_null_float.index[c_null], "C"] = np.nan + df_null_float.loc[df_null_float.index[d_null], "D"] = np.nan + + for variable, evidence in [ + ("A", []), + ("B", ["A"]), + ("C", ["A", "B"]), + ("D", ["A", "B", "C"]), + ]: _test_ckde_logl_null(variable, evidence, df, df_null) _test_ckde_logl_null(variable, evidence, df_small, df_null) _test_ckde_logl_null(variable, evidence, df_float, df_null_float) _test_ckde_logl_null(variable, evidence, df_small_float, df_null_float) - cpd = pbn.CKDE('d', ['a', 'b', 'c']) + cpd = pbn.CKDE("D", ["A", "B", "C"]) cpd.fit(df) - cpd2 = pbn.CKDE('d', ['c', 'b', 'a']) + cpd2 = pbn.CKDE("D", ["C", "B", "A"]) cpd2.fit(df) ll = cpd.logl(df_null) ll2 = cpd2.logl(df_null) - assert np.all(np.isclose(ll, ll2, equal_nan=True)), "Order of evidence changes the position of nan values." + assert np.all( + np.isclose(ll, ll2, equal_nan=True) + ), "Order of evidence changes the position of nan values." - cpd = pbn.CKDE('d', ['a', 'b', 'c']) + cpd = pbn.CKDE("D", ["A", "B", "C"]) cpd.fit(df_float) - cpd2 = pbn.CKDE('d', ['c', 'b', 'a']) + cpd2 = pbn.CKDE("D", ["C", "B", "A"]) cpd2.fit(df_float) ll = cpd.logl(df_null_float) ll2 = cpd2.logl(df_null_float) - assert np.all(np.isclose(ll, ll2, equal_nan=True)), "Order of evidence changes the position of nan values." + assert np.all( + np.isclose(ll, ll2, equal_nan=True) + ), "Order of evidence changes the position of nan values." 
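slogl() is expected to be the aggregation of logl(): on complete data it should match the plain sum, and with missing values the NaN rows contribute nothing, which is why the references in these tests use np.nansum. A quick consistency sketch reusing the helpers imported in this file (df_check is a hypothetical frame; the equivalence below is the property the tests rely on, not a documented guarantee):

    df_check = generate_normal_data(100, seed=2)
    cpd_check = pbn.CKDE("A", ["B"])
    cpd_check.fit(df_check)
    assert np.isclose(cpd_check.slogl(df_check), np.nansum(cpd_check.logl(df_check)))
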
+ def test_ckde_slogl(): def _test_ckde_slogl(variable, evidence, _df, _test_df): @@ -319,34 +418,48 @@ def _test_ckde_slogl(variable, evidence, _df, _test_df): cpd.fit(_df) scipy_kde_joint, scipy_kde_marg = train_scipy_ckde(_df, variable, evidence) - scipy_logl = scipy_ckde_logpdf(_test_df, scipy_kde_joint, scipy_kde_marg, variable, evidence) + scipy_logl = scipy_ckde_logpdf( + _test_df, scipy_kde_joint, scipy_kde_marg, variable, evidence + ) if np.all(_test_df.dtypes == "float32"): # Allow an error of 0.0005 for each training instance. - assert np.isclose(cpd.slogl(_test_df), scipy_logl.sum(), atol=0.0005*_df.shape[0]) + assert np.isclose( + cpd.slogl(_test_df), scipy_logl.sum(), atol=0.0005 * _df.shape[0] + ) else: assert np.isclose(cpd.slogl(_test_df), scipy_logl.sum()) - test_df = util_test.generate_normal_data(TEST_SIZE, seed=1) - test_df_float = test_df.astype('float32') + test_df = generate_normal_data(TEST_SIZE, seed=1) + test_df_float = test_df.astype("float32") - for variable, evidence in [('a', []), ('b', ['a']), ('c', ['a', 'b']), ('d', ['a', 'b', 'c'])]: + for variable, evidence in [ + ("A", []), + ("B", ["A"]), + ("C", ["A", "B"]), + ("D", ["A", "B", "C"]), + ]: _test_ckde_slogl(variable, evidence, df, test_df) _test_ckde_slogl(variable, evidence, df_small, test_df) _test_ckde_slogl(variable, evidence, df_float, test_df_float) _test_ckde_slogl(variable, evidence, df_small_float, test_df_float) - cpd = pbn.CKDE('d', ['a', 'b', 'c']) + cpd = pbn.CKDE("D", ["A", "B", "C"]) cpd.fit(df) - cpd2 = pbn.CKDE('d', ['c', 'b', 'a']) + cpd2 = pbn.CKDE("D", ["C", "B", "A"]) cpd2.fit(df) - assert np.all(np.isclose(cpd.slogl(test_df), cpd2.slogl(test_df))), "Order of evidence changes slogl() result." + assert np.all( + np.isclose(cpd.slogl(test_df), cpd2.slogl(test_df)) + ), "Order of evidence changes slogl() result." - cpd = pbn.CKDE('d', ['a', 'b', 'c']) + cpd = pbn.CKDE("D", ["A", "B", "C"]) cpd.fit(df_float) - cpd2 = pbn.CKDE('d', ['c', 'b', 'a']) + cpd2 = pbn.CKDE("D", ["C", "B", "A"]) cpd2.fit(df_float) - assert np.all(np.isclose(cpd.slogl(test_df_float), cpd2.slogl(test_df_float))), "Order of evidence changes slogl() result." + assert np.all( + np.isclose(cpd.slogl(test_df_float), cpd2.slogl(test_df_float)) + ), "Order of evidence changes slogl() result." + def test_ckde_slogl_null(): def _test_ckde_slogl_null(variable, evidence, _df, _test_df): @@ -354,17 +467,20 @@ def _test_ckde_slogl_null(variable, evidence, _df, _test_df): cpd.fit(_df) scipy_kde_joint, scipy_kde_marg = train_scipy_ckde(_df, variable, evidence) - scipy_logl = scipy_ckde_logpdf(_test_df, scipy_kde_joint, scipy_kde_marg, variable, evidence) + scipy_logl = scipy_ckde_logpdf( + _test_df, scipy_kde_joint, scipy_kde_marg, variable, evidence + ) if np.all(_test_df.dtypes == "float32"): # Allow an error of 0.0005 for each training instance. 
- assert np.isclose(cpd.slogl(_test_df), np.nansum(scipy_logl), atol=0.0005*_df.shape[0]) + assert np.isclose( + cpd.slogl(_test_df), np.nansum(scipy_logl), atol=0.0005 * _df.shape[0] + ) else: assert np.isclose(cpd.slogl(_test_df), np.nansum(scipy_logl)) - - test_df = util_test.generate_normal_data(TEST_SIZE, seed=1) - test_df_float = test_df.astype('float32') + test_df = generate_normal_data(TEST_SIZE, seed=1) + test_df_float = test_df.astype("float32") np.random.seed(0) a_null = np.random.randint(0, TEST_SIZE, size=10) @@ -373,35 +489,44 @@ def _test_ckde_slogl_null(variable, evidence, _df, _test_df): d_null = np.random.randint(0, TEST_SIZE, size=10) df_null = test_df.copy() - df_null.loc[df_null.index[a_null], 'a'] = np.nan - df_null.loc[df_null.index[b_null], 'b'] = np.nan - df_null.loc[df_null.index[c_null], 'c'] = np.nan - df_null.loc[df_null.index[d_null], 'd'] = np.nan + df_null.loc[df_null.index[a_null], "A"] = np.nan + df_null.loc[df_null.index[b_null], "B"] = np.nan + df_null.loc[df_null.index[c_null], "C"] = np.nan + df_null.loc[df_null.index[d_null], "D"] = np.nan df_null_float = test_df_float.copy() - df_null_float.loc[df_null_float.index[a_null], 'a'] = np.nan - df_null_float.loc[df_null_float.index[b_null], 'b'] = np.nan - df_null_float.loc[df_null_float.index[c_null], 'c'] = np.nan - df_null_float.loc[df_null_float.index[d_null], 'd'] = np.nan - - for variable, evidence in [('a', []), ('b', ['a']), ('c', ['a', 'b']), ('d', ['a', 'b', 'c'])]: + df_null_float.loc[df_null_float.index[a_null], "A"] = np.nan + df_null_float.loc[df_null_float.index[b_null], "B"] = np.nan + df_null_float.loc[df_null_float.index[c_null], "C"] = np.nan + df_null_float.loc[df_null_float.index[d_null], "D"] = np.nan + + for variable, evidence in [ + ("A", []), + ("B", ["A"]), + ("C", ["A", "B"]), + ("D", ["A", "B", "C"]), + ]: _test_ckde_slogl_null(variable, evidence, df, df_null) _test_ckde_slogl_null(variable, evidence, df_small, df_null) _test_ckde_slogl_null(variable, evidence, df_float, df_null_float) _test_ckde_slogl_null(variable, evidence, df_small_float, df_null_float) - - cpd = pbn.CKDE('d', ['a', 'b', 'c']) + cpd = pbn.CKDE("D", ["A", "B", "C"]) cpd.fit(df) - cpd2 = pbn.CKDE('d', ['c', 'b', 'a']) + cpd2 = pbn.CKDE("D", ["C", "B", "A"]) cpd2.fit(df) - assert np.all(np.isclose(cpd.slogl(df_null), cpd2.slogl(df_null))), "Order of evidence changes slogl() result." + assert np.all( + np.isclose(cpd.slogl(df_null), cpd2.slogl(df_null)) + ), "Order of evidence changes slogl() result." - cpd = pbn.CKDE('d', ['a', 'b', 'c']) + cpd = pbn.CKDE("D", ["A", "B", "C"]) cpd.fit(df_float) - cpd2 = pbn.CKDE('d', ['c', 'b', 'a']) + cpd2 = pbn.CKDE("D", ["C", "B", "A"]) cpd2.fit(df_float) - assert np.all(np.isclose(cpd.slogl(df_null_float), cpd2.slogl(df_null_float))), "Order of evidence changes slogl() result." + assert np.all( + np.isclose(cpd.slogl(df_null_float), cpd2.slogl(df_null_float)) + ), "Order of evidence changes slogl() result." 
+

def test_ckde_cdf():
    def _test_ckde_cdf(variable, evidence, _df, _test_df):
@@ -410,33 +535,45 @@ def _test_ckde_cdf(variable, evidence, _df, _test_df):
         cpd.fit(_df)
         scipy_kde_joint, scipy_kde_marg = train_scipy_ckde(_df, variable, evidence)

         cdf = cpd.cdf(_test_df)
-        scipy = scipy_ckde_cdf(_test_df, scipy_kde_joint, scipy_kde_marg, variable, evidence)
+        scipy = scipy_ckde_cdf(
+            _test_df, scipy_kde_joint, scipy_kde_marg, variable, evidence
+        )

-        if np.all(_df.dtypes == 'float32'):
+        if np.all(_df.dtypes == "float32"):
             assert np.all(np.isclose(cdf, scipy, atol=0.0005))
         else:
             assert np.all(np.isclose(cdf, scipy))

-
-    test_df = util_test.generate_normal_data(TEST_SIZE, seed=1)
-    test_df_float = test_df.astype('float32')

-    for variable, evidence in [('a', []), ('b', ['a']), ('c', ['a', 'b']), ('d', ['a', 'b', 'c'])]:
+    test_df = generate_normal_data(TEST_SIZE, seed=1)
+    test_df_float = test_df.astype("float32")
+
+    for variable, evidence in [
+        ("A", []),
+        ("B", ["A"]),
+        ("C", ["A", "B"]),
+        ("D", ["A", "B", "C"]),
+    ]:
         _test_ckde_cdf(variable, evidence, df, test_df)
         _test_ckde_cdf(variable, evidence, df_small, test_df)
         _test_ckde_cdf(variable, evidence, df_float, test_df_float)
         _test_ckde_cdf(variable, evidence, df_small_float, test_df_float)

-    cpd = pbn.CKDE('d', ['a', 'b', 'c'])
+    cpd = pbn.CKDE("D", ["A", "B", "C"])
     cpd.fit(df)
-    cpd2 = pbn.CKDE('d', ['c', 'b', 'a'])
+    cpd2 = pbn.CKDE("D", ["C", "B", "A"])
     cpd2.fit(df)

-    assert np.all(np.isclose(cpd.cdf(test_df), cpd2.cdf(test_df))), "Order of evidence changes logl() result."
+    assert np.all(
+        np.isclose(cpd.cdf(test_df), cpd2.cdf(test_df))
+    ), "Order of evidence changes cdf() result."

-    cpd = pbn.CKDE('d', ['a', 'b', 'c'])
+    cpd = pbn.CKDE("D", ["A", "B", "C"])
     cpd.fit(df_float)
-    cpd2 = pbn.CKDE('d', ['c', 'b', 'a'])
+    cpd2 = pbn.CKDE("D", ["C", "B", "A"])
     cpd2.fit(df_float)

-    assert np.all(np.isclose(cpd.cdf(test_df_float), cpd2.cdf(test_df_float), atol=0.0005)), "Order of evidence changes logl() result."
+    assert np.all(
+        np.isclose(cpd.cdf(test_df_float), cpd2.cdf(test_df_float), atol=0.0005)
+    ), "Order of evidence changes cdf() result."
+ def test_ckde_cdf_null(): def _test_ckde_cdf_null(variable, evidence, _df, _test_df): @@ -446,16 +583,17 @@ def _test_ckde_cdf_null(variable, evidence, _df, _test_df): scipy_kde_joint, scipy_kde_marg = train_scipy_ckde(_df, variable, evidence) cdf = cpd.cdf(_test_df) - scipy = scipy_ckde_cdf(_test_df, scipy_kde_joint, scipy_kde_marg, variable, evidence) + scipy = scipy_ckde_cdf( + _test_df, scipy_kde_joint, scipy_kde_marg, variable, evidence + ) - if np.all(_df.dtypes == 'float32'): + if np.all(_df.dtypes == "float32"): assert np.all(np.isclose(cdf, scipy, atol=0.0005, equal_nan=True)) else: assert np.all(np.isclose(cdf, scipy, equal_nan=True)) - - test_df = util_test.generate_normal_data(TEST_SIZE, seed=1) - test_df_float = test_df.astype('float32') + test_df = generate_normal_data(TEST_SIZE, seed=1) + test_df_float = test_df.astype("float32") np.random.seed(0) a_null = np.random.randint(0, TEST_SIZE, size=10) @@ -464,91 +602,105 @@ def _test_ckde_cdf_null(variable, evidence, _df, _test_df): d_null = np.random.randint(0, TEST_SIZE, size=10) df_null = test_df.copy() - df_null.loc[df_null.index[a_null], 'a'] = np.nan - df_null.loc[df_null.index[b_null], 'b'] = np.nan - df_null.loc[df_null.index[c_null], 'c'] = np.nan - df_null.loc[df_null.index[d_null], 'd'] = np.nan + df_null.loc[df_null.index[a_null], "A"] = np.nan + df_null.loc[df_null.index[b_null], "B"] = np.nan + df_null.loc[df_null.index[c_null], "C"] = np.nan + df_null.loc[df_null.index[d_null], "D"] = np.nan df_null_float = test_df_float.copy() - df_null_float.loc[df_null_float.index[a_null], 'a'] = np.nan - df_null_float.loc[df_null_float.index[b_null], 'b'] = np.nan - df_null_float.loc[df_null_float.index[c_null], 'c'] = np.nan - df_null_float.loc[df_null_float.index[d_null], 'd'] = np.nan - - for variable, evidence in [('a', []), ('b', ['a']), ('c', ['a', 'b']), ('d', ['a', 'b', 'c'])]: + df_null_float.loc[df_null_float.index[a_null], "A"] = np.nan + df_null_float.loc[df_null_float.index[b_null], "B"] = np.nan + df_null_float.loc[df_null_float.index[c_null], "C"] = np.nan + df_null_float.loc[df_null_float.index[d_null], "D"] = np.nan + + for variable, evidence in [ + ("A", []), + ("B", ["A"]), + ("C", ["A", "B"]), + ("D", ["A", "B", "C"]), + ]: _test_ckde_cdf_null(variable, evidence, df, df_null) _test_ckde_cdf_null(variable, evidence, df_small, df_null) _test_ckde_cdf_null(variable, evidence, df_float, df_null_float) _test_ckde_cdf_null(variable, evidence, df_small_float, df_null_float) - - cpd = pbn.CKDE('d', ['a', 'b', 'c']) + cpd = pbn.CKDE("D", ["A", "B", "C"]) cpd.fit(df) - cpd2 = pbn.CKDE('d', ['c', 'b', 'a']) + cpd2 = pbn.CKDE("D", ["C", "B", "A"]) cpd2.fit(df) - assert np.all(np.isclose(cpd.cdf(df_null), cpd2.cdf(df_null), equal_nan=True)), "Order of evidence changes cdf() result." + assert np.all( + np.isclose(cpd.cdf(df_null), cpd2.cdf(df_null), equal_nan=True) + ), "Order of evidence changes cdf() result." - cpd = pbn.CKDE('d', ['a', 'b', 'c']) + cpd = pbn.CKDE("D", ["A", "B", "C"]) cpd.fit(df_float) - cpd2 = pbn.CKDE('d', ['c', 'b', 'a']) + cpd2 = pbn.CKDE("D", ["C", "B", "A"]) cpd2.fit(df_float) - assert np.all(np.isclose(cpd.cdf(df_null_float), - cpd2.cdf(df_null_float), - atol=0.0005, equal_nan=True)), "Order of evidence changes cdf() result." + assert np.all( + np.isclose( + cpd.cdf(df_null_float), cpd2.cdf(df_null_float), atol=0.0005, equal_nan=True + ) + ), "Order of evidence changes cdf() result." 
+ def test_ckde_sample(): SAMPLE_SIZE = 1000 - cpd = pbn.CKDE('a', []) + cpd = pbn.CKDE("A", []) cpd.fit(df) - + sampled = cpd.sample(SAMPLE_SIZE, None, 0) assert sampled.type == pa.float64() assert int(sampled.nbytes / (sampled.type.bit_width / 8)) == SAMPLE_SIZE - - cpd = pbn.CKDE('b', ['a']) + + cpd = pbn.CKDE("B", ["A"]) cpd.fit(df) - sampling_df = pd.DataFrame({'a': np.full((SAMPLE_SIZE,), 3.0)}) + sampling_df = pd.DataFrame({"A": np.full((SAMPLE_SIZE,), 3.0)}) sampled = cpd.sample(SAMPLE_SIZE, sampling_df, 0) assert sampled.type == pa.float64() assert int(sampled.nbytes / (sampled.type.bit_width / 8)) == SAMPLE_SIZE - - cpd = pbn.CKDE('c', ['a', 'b']) + + cpd = pbn.CKDE("C", ["A", "B"]) cpd.fit(df) - sampling_df = pd.DataFrame({'a': np.full((SAMPLE_SIZE,), 3.0), - 'b': np.full((SAMPLE_SIZE,), 7.45)}) + sampling_df = pd.DataFrame( + {"A": np.full((SAMPLE_SIZE,), 3.0), "B": np.full((SAMPLE_SIZE,), 7.45)} + ) sampled = cpd.sample(SAMPLE_SIZE, sampling_df, 0) assert sampled.type == pa.float64() assert int(sampled.nbytes / (sampled.type.bit_width / 8)) == SAMPLE_SIZE - cpd = pbn.CKDE('a', []) + cpd = pbn.CKDE("A", []) cpd.fit(df_float) - + sampled = cpd.sample(SAMPLE_SIZE, None, 0) assert sampled.type == pa.float32() assert int(sampled.nbytes / (sampled.type.bit_width / 8)) == SAMPLE_SIZE - - cpd = pbn.CKDE('b', ['a']) + + cpd = pbn.CKDE("B", ["A"]) cpd.fit(df_float) - sampling_df = pd.DataFrame({'a': np.full((SAMPLE_SIZE,), 3.0, dtype=np.float32)}) + sampling_df = pd.DataFrame({"A": np.full((SAMPLE_SIZE,), 3.0, dtype=np.float32)}) sampled = cpd.sample(SAMPLE_SIZE, sampling_df, 0) assert sampled.type == pa.float32() assert int(sampled.nbytes / (sampled.type.bit_width / 8)) == SAMPLE_SIZE - - cpd = pbn.CKDE('c', ['a', 'b']) + + cpd = pbn.CKDE("C", ["A", "B"]) cpd.fit(df_float) - sampling_df = pd.DataFrame({'a': np.full((SAMPLE_SIZE,), 3.0, dtype=np.float32), - 'b': np.full((SAMPLE_SIZE,), 7.45, dtype=np.float32)}) + sampling_df = pd.DataFrame( + { + "A": np.full((SAMPLE_SIZE,), 3.0, dtype=np.float32), + "B": np.full((SAMPLE_SIZE,), 7.45, dtype=np.float32), + } + ) sampled = cpd.sample(SAMPLE_SIZE, sampling_df, 0) assert sampled.type == pa.float32() - assert int(sampled.nbytes / (sampled.type.bit_width / 8)) == SAMPLE_SIZE \ No newline at end of file + assert int(sampled.nbytes / (sampled.type.bit_width / 8)) == SAMPLE_SIZE diff --git a/tests/factors/continuous/KDE_test.py b/tests/factors/continuous/KDE_test.py index f78ba2d0..3d50ab91 100644 --- a/tests/factors/continuous/KDE_test.py +++ b/tests/factors/continuous/KDE_test.py @@ -1,18 +1,23 @@ -import pytest import numpy as np import pyarrow as pa import pybnesian as pbn -from pybnesian import BandwidthSelector +import pytest +from helpers.data import generate_normal_data from scipy.stats import gaussian_kde -import util_test - SIZE = 500 -df = util_test.generate_normal_data(SIZE, seed=0) -df_float = df.astype('float32') +df = generate_normal_data(SIZE, seed=0) +df_float = df.astype("float32") + + +def test_check_type() -> None: + """ + Tests that the KDE factor raises a ValueError when the data type of the test dataset + is different from the data type of the training dataset during log-likelihood and + smoothed log-likelihood computations. + """ -def test_check_type(): - cpd = pbn.KDE(['a']) + cpd = pbn.KDE(["A"]) cpd.fit(df) with pytest.raises(ValueError) as ex: cpd.logl(df_float) @@ -29,37 +34,65 @@ def test_check_type(): cpd.slogl(df) assert "Data type of training and test datasets is different." 
 in str(ex.value)


 def test_kde_variables():
-    for variables in [['a'], ['b', 'a'], ['c', 'a', 'b'], ['d', 'a', 'b', 'c']]:
+    """
+    Tests the initialization of the KDE class with different sets of variables.
+    For each list of variable names, this test creates a KDE object and asserts
+    that the object's variables match the input list. This ensures that the KDE
+    class correctly stores and returns its variables upon initialization.
+    """
+
+    for variables in [["A"], ["B", "A"], ["C", "A", "B"], ["D", "A", "B", "C"]]:
         cpd = pbn.KDE(variables)
         assert cpd.variables() == variables

+
 def test_kde_bandwidth():
-    for variables in [['a'], ['b', 'a'], ['c', 'a', 'b'], ['d', 'a', 'b', 'c']]:
+    """
+    Tests the bandwidth selection and assignment functionality of the KDE class.
+    This test verifies:
+    - That the KDE bandwidth computed using the normal reference rule matches the output of scipy's gaussian_kde with a custom bandwidth method, for various variable sets and sample sizes.
+    - That the KDE bandwidth computed using Scott's rule matches the output of scipy's gaussian_kde default bandwidth, for various variable sets and sample sizes.
+    - That the bandwidth attribute of the KDE object can be manually set and correctly reflects the assigned value.
+    The test is performed for both float64 and float32 dataframes.
+    """
+
+    for variables in [["A"], ["B", "A"], ["C", "A", "B"], ["D", "A", "B", "C"]]:
        for instances in [50, 1000, 10000]:
            npdata = df.loc[:, variables].to_numpy()

            # Test normal reference rule
-            scipy_kde = gaussian_kde(npdata[:instances, :].T,
-                            bw_method=lambda s : np.power(4 / (s.d + 2), 1 / (s.d + 4)) * s.scotts_factor())
+            scipy_kde = gaussian_kde(
+                npdata[:instances, :].T,
+                bw_method=lambda s: np.power(4 / (s.d + 2), 1 / (s.d + 4))
+                * s.scotts_factor(),
+            )

            cpd = pbn.KDE(variables)
            cpd.fit(df.iloc[:instances])
-            assert np.all(np.isclose(cpd.bandwidth, scipy_kde.covariance)), "Wrong bandwidth computed with normal reference rule."
+            assert np.all(
+                np.isclose(cpd.bandwidth, scipy_kde.covariance)
+            ), "Wrong bandwidth computed with normal reference rule."

            cpd.fit(df_float.iloc[:instances])
-            assert np.all(np.isclose(cpd.bandwidth, scipy_kde.covariance)), "Wrong bandwidth computed with normal reference rule."
+            assert np.all(
+                np.isclose(cpd.bandwidth, scipy_kde.covariance)
+            ), "Wrong bandwidth computed with normal reference rule."

            scipy_kde = gaussian_kde(npdata[:instances, :].T)

            cpd = pbn.KDE(variables, pbn.ScottsBandwidth())
            cpd.fit(df.iloc[:instances])
-            assert np.all(np.isclose(cpd.bandwidth, scipy_kde.covariance)), "Wrong bandwidth computed with Scott's rule."
+            assert np.all(
+                np.isclose(cpd.bandwidth, scipy_kde.covariance)
+            ), "Wrong bandwidth computed with Scott's rule."

            cpd.fit(df_float.iloc[:instances])
-            assert np.all(np.isclose(cpd.bandwidth, scipy_kde.covariance)), "Wrong bandwidth computed with Scott's rule."
-
+            assert np.all(
+                np.isclose(cpd.bandwidth, scipy_kde.covariance)
+            ), "Wrong bandwidth computed with Scott's rule."

-    cpd = pbn.KDE(['a'])
+    cpd = pbn.KDE(["A"])
     cpd.fit(df)
     cpd.bandwidth = [[1]]
     assert cpd.bandwidth == np.asarray([[1]]), "Could not change bandwidth."
@@ -68,34 +101,77 @@ def test_kde_bandwidth():
     cpd.bandwidth = [[1]]
     assert cpd.bandwidth == np.asarray([[1]]), "Could not change bandwidth."

-class UnitaryBandwidth(BandwidthSelector):
+
+class UnitaryBandwidth(pbn.BandwidthSelector):
+    """
+    A bandwidth selector that returns the identity matrix as the bandwidth.
+    This class is a subclass of `pbn.BandwidthSelector` and implements a simple bandwidth selection strategy
+    where the bandwidth matrix is always the identity matrix of size equal to the number of variables.
+    Methods
+    -------
+    __init__():
+        Initializes the UnitaryBandwidth selector.
+    bandwidth(df, variables):
+        Returns the identity matrix of shape (len(variables), len(variables)) as the bandwidth matrix,
+        where `df` (pandas.DataFrame) is ignored by this selector and `variables` (list) is the list
+        of variables for which the bandwidth is to be computed.
+    """
+
     def __init__(self):
-        BandwidthSelector.__init__(self)
+        pbn.BandwidthSelector.__init__(self)

     def bandwidth(self, df, variables):
         return np.eye(len(variables))

+
 def test_kde_new_bandwidth():
-    kde = pbn.KDE(["a"], UnitaryBandwidth())
+    """
+    Tests the behavior of the KDE class when using the UnitaryBandwidth bandwidth selector.
+    This test verifies that:
+    - When fitting a KDE with a single variable, the resulting bandwidth matrix is the 1x1 identity matrix.
+    - When fitting a KDE with four variables, the resulting bandwidth matrix is the 4x4 identity matrix.
+    - The behavior is consistent for both float64 and float32 dataframes.
+    Assertions:
+    - The bandwidth matrix after fitting is as expected (identity matrix) for both data types and variable counts.
+    """
+
+    kde = pbn.KDE(["A"], UnitaryBandwidth())
    kde.fit(df)
    assert kde.bandwidth == np.eye(1)
    kde.fit(df_float)
    assert kde.bandwidth == np.eye(1)

-    kde = pbn.KDE(["a", "b", "c", "d"], UnitaryBandwidth())
+    kde = pbn.KDE(["A", "B", "C", "D"], UnitaryBandwidth())
    kde.fit(df)
    assert np.all(kde.bandwidth == np.eye(4))
    kde.fit(df_float)
    assert np.all(kde.bandwidth == np.eye(4))

+
def test_kde_data_type():
-    k = pbn.KDE(["a"])
+    """
+    Tests the `data_type` method of the KDE factor.
+    This test verifies that:
+    - Calling `data_type` before fitting the KDE raises a ValueError with the message "KDE factor not fitted".
+    - After fitting the KDE with a DataFrame `df`, the returned data type is `pa.float64()`.
+    - After fitting the KDE with a DataFrame `df_float`, the returned data type is `pa.float32()`.
+    """
+
+    k = pbn.KDE(["A"])
     with pytest.raises(ValueError) as ex:
         k.data_type()
-    "KDE factor not fitted" in str(ex.value)
+    assert "KDE factor not fitted" in str(ex.value)

     k.fit(df)
     assert k.data_type() == pa.float64()
@@ -104,42 +180,78 @@

 def test_kde_fit():
+    """
+    Tests the fitting process of the KDE (Kernel Density Estimation) class in the PyBNesian library.
+    This test verifies that:
+    - The KDE object is not fitted before calling `fit`.
+    - After fitting with a subset of the provided DataFrame, the KDE object is marked as fitted.
+    - The number of training instances and variables in the fitted KDE matches those of a reference `scipy.stats.gaussian_kde` object.
+    - The test is performed for different combinations of variables and different numbers of training instances, using both float64 and float32 DataFrames.
+    Tested scenarios:
+    - Single and multiple variable KDEs.
+    - Different sample sizes (50, 150, 500).
+    - Both float64 and float32 data types.
+    """
+
    def _test_kde_fit_iter(variables, _df, instances):
        cpd = pbn.KDE(variables)
        assert not cpd.fitted()
-        cpd.fit(_df.iloc[:instances,:])
+        cpd.fit(_df.iloc[:instances, :])
        assert cpd.fitted()

        npdata = _df.loc[:, variables].to_numpy()
-        scipy_kde = gaussian_kde(npdata[:instances, :].T,
-                        bw_method=lambda s : np.power(4 / (s.d + 2), 1 / (s.d + 4)) * s.scotts_factor())
+        scipy_kde = gaussian_kde(
+            npdata[:instances, :].T,
+            bw_method=lambda s: np.power(4 / (s.d + 2), 1 / (s.d + 4))
+            * s.scotts_factor(),
+        )

        assert scipy_kde.n == cpd.num_instances(), "Wrong number of training instances."
        assert scipy_kde.d == cpd.num_variables(), "Wrong number of training variables."

-    for variables in [['a'], ['b', 'a'], ['c', 'a', 'b'], ['d', 'a', 'b', 'c']]:
+    for variables in [["A"], ["B", "A"], ["C", "A", "B"], ["D", "A", "B", "C"]]:
        for instances in [50, 150, 500]:
            _test_kde_fit_iter(variables, df, instances)
            _test_kde_fit_iter(variables, df_float, instances)

+
def test_kde_fit_null():
+    """
+    Test the fitting of the KDE (Kernel Density Estimator) model when input data contains null (NaN) values.
+    This test verifies that:
+    - The KDE model is not fitted before calling `fit` and is fitted after.
+    - The model correctly ignores rows with null values during fitting.
+    - The number of training instances and variables in the fitted model matches those in a reference `scipy.stats.gaussian_kde` fitted on the same data with nulls removed.
+    - The computed bandwidth (covariance) of the KDE matches that of the reference implementation.
+    The test is performed for different combinations of variables and different numbers of training instances, using both float64 and float32 dataframes with randomly inserted NaN values.
+    """
+
    def _test_kde_fit_null_iter(variables, _df, instances):
        cpd = pbn.KDE(variables)
        assert not cpd.fitted()
-        cpd.fit(_df.iloc[:instances,:])
+        cpd.fit(_df.iloc[:instances, :])
        assert cpd.fitted()

        npdata = _df.loc[:, variables].to_numpy()
-        npdata_instances = npdata[:instances,:]
+        npdata_instances = npdata[:instances, :]
        nan_rows = np.any(np.isnan(npdata_instances), axis=1)
-        npdata_no_null = npdata_instances[~nan_rows,:]
-        scipy_kde = gaussian_kde(npdata_no_null.T,
-                        bw_method=lambda s : np.power(4 / (s.d + 2), 1 / (s.d + 4)) * s.scotts_factor())
-
-        assert scipy_kde.n == cpd.num_instances(), "Wrong number of training instances with null values."
-        assert scipy_kde.d == cpd.num_variables(), "Wrong number of training variables with null values."
-        assert np.all(np.isclose(scipy_kde.covariance, cpd.bandwidth)), "Wrong bandwidth with null values."
+        npdata_no_null = npdata_instances[~nan_rows, :]
+        scipy_kde = gaussian_kde(
+            npdata_no_null.T,
+            bw_method=lambda s: np.power(4 / (s.d + 2), 1 / (s.d + 4))
+            * s.scotts_factor(),
+        )
+
+        assert (
+            scipy_kde.n == cpd.num_instances()
+        ), "Wrong number of training instances with null values."
+        assert (
+            scipy_kde.d == cpd.num_variables()
+        ), "Wrong number of training variables with null values."
+        assert np.all(
+            np.isclose(scipy_kde.covariance, cpd.bandwidth)
+        ), "Wrong bandwidth with null values."
    np.random.seed(0)
    a_null = np.random.randint(0, SIZE, size=100)
@@ -148,86 +260,145 @@ def _test_kde_fit_null_iter(variables, _df, instances):
    d_null = np.random.randint(0, SIZE, size=100)

    df_null = df.copy()
-    df_null.loc[df_null.index[a_null], 'a'] = np.nan
-    df_null.loc[df_null.index[b_null], 'b'] = np.nan
-    df_null.loc[df_null.index[c_null], 'c'] = np.nan
-    df_null.loc[df_null.index[d_null], 'd'] = np.nan
+    df_null.loc[df_null.index[a_null], "A"] = np.nan
+    df_null.loc[df_null.index[b_null], "B"] = np.nan
+    df_null.loc[df_null.index[c_null], "C"] = np.nan
+    df_null.loc[df_null.index[d_null], "D"] = np.nan

    df_null_float = df_float.copy()
-    df_null_float.loc[df_null_float.index[a_null], 'a'] = np.nan
-    df_null_float.loc[df_null_float.index[b_null], 'b'] = np.nan
-    df_null_float.loc[df_null_float.index[c_null], 'c'] = np.nan
-    df_null_float.loc[df_null_float.index[d_null], 'd'] = np.nan
+    df_null_float.loc[df_null_float.index[a_null], "A"] = np.nan
+    df_null_float.loc[df_null_float.index[b_null], "B"] = np.nan
+    df_null_float.loc[df_null_float.index[c_null], "C"] = np.nan
+    df_null_float.loc[df_null_float.index[d_null], "D"] = np.nan

-    for variables in [['a'], ['b', 'a'], ['c', 'a', 'b'], ['d', 'a', 'b', 'c']]:
+    for variables in [["A"], ["B", "A"], ["C", "A", "B"], ["D", "A", "B", "C"]]:
        for instances in [50, 150, 500]:
            _test_kde_fit_null_iter(variables, df_null, instances)
            _test_kde_fit_null_iter(variables, df_null_float, instances)

+
def test_kde_logl():
+    """Tests the logl() method of the KDE factor. It compares the results with the ones obtained with scipy's gaussian_kde.
+    Both for float64 and float32 data types."""
+
    def _test_kde_logl_iter(variables, _df, _test_df):
-        cpd = pbn.KDE(variables)
+        """Tests that the logl() method of the KDE factor returns the same results as scipy's gaussian_kde.
+        It trains on _df and evaluates on _test_df.
+        Args:
+            variables (list[str]): Dataset variables to use.
+            _df (pd.DataFrame): Training dataset.
+            _test_df (pd.DataFrame): Test dataset.
+        """
+        npdata = _df.loc[:, variables].to_numpy()
+        cpd = pbn.KDE(
+            variables,
+            # bandwidth_selector=pbn.ScottsBandwidth(),
+            bandwidth_selector=pbn.NormalReferenceRule(),
+        )
        cpd.fit(_df)

-        npdata = _df.loc[:, variables].to_numpy()
-        scipy_kde = gaussian_kde(npdata.T,
-                        bw_method=lambda s : np.power(4 / (s.d + 2), 1 / (s.d + 4)) * s.scotts_factor())
+        scipy_kde = gaussian_kde(
+            dataset=npdata.T,
+            # bw_method="scott",
+            bw_method=lambda s: np.power(4 / (s.d + 2), 1 / (s.d + 4))
+            * s.scotts_factor(),  # Normal reference rule: Scott's factor rescaled by (4 / (d + 2)) ** (1 / (d + 4))
+        )
+
+        # TODO: Add tests to check this
+        # NOTE
+        # scipy_kde.factor == scipy_kde.covariance_factor(): the coefficient that, squared, multiplies the data covariance matrix to obtain the kernel covariance matrix.
+        # scipy_kde.covariance == scipy_kde.factor ** 2 * np.cov(npdata.T) (factor ** 2 * npdata.var() in the univariate case)
+        # scipy_kde.inv_cov == np.linalg.inv(scipy_kde.covariance) (1 / scipy_kde.covariance in the univariate case)
+        # We check that the bandwidth is the same
+        # TODO: Add tests to check "scott" bandwidth selectors
+        assert np.all(np.isclose(cpd.bandwidth, scipy_kde.covariance))

        test_npdata = _test_df.loc[:, variables].to_numpy()

        logl = cpd.logl(_test_df)
-        scipy = scipy_kde.logpdf(test_npdata.T)
+        scipy_logl = scipy_kde.logpdf(test_npdata.T)

-        if np.all(_df.dtypes == 'float32'):
-            assert np.all(np.isclose(logl, scipy, atol=0.0005))
+        if np.all(_df.dtypes == "float32"):
+            assert np.all(np.isclose(logl, scipy_logl, atol=0.0005))
        else:
-            assert np.all(np.isclose(logl, scipy))
+            assert np.all(np.isclose(logl, scipy_logl))

-    test_df = util_test.generate_normal_data(50, seed=1)
-    test_df_float = test_df.astype('float32')
+    test_df = generate_normal_data(50, seed=1)
+    test_df_float = test_df.astype("float32")

-    for variables in [['a'], ['b', 'a'], ['c', 'a', 'b'], ['d', 'a', 'b', 'c']]:
+    for variables in [["A"], ["B", "A"], ["C", "A", "B"], ["D", "A", "B", "C"]]:
        _test_kde_logl_iter(variables, df, test_df)
        _test_kde_logl_iter(variables, df_float, test_df_float)

-    cpd = pbn.KDE(['d', 'a', 'b', 'c'])
+    cpd = pbn.KDE(["D", "A", "B", "C"])
    cpd.fit(df)
-    cpd2 = pbn.KDE(['a', 'c', 'd', 'b'])
+    cpd2 = pbn.KDE(["A", "C", "D", "B"])
    cpd2.fit(df)

-    assert np.all(np.isclose(cpd.logl(test_df), cpd2.logl(test_df))), "Order of evidence changes logl() result."
+    assert np.all(
+        np.isclose(cpd.logl(test_df), cpd2.logl(test_df))
+    ), "Order of evidence changes logl() result."

-    cpd = pbn.KDE(['d', 'a', 'b', 'c'])
+    cpd = pbn.KDE(["D", "A", "B", "C"])
    cpd.fit(df_float)
-    cpd2 = pbn.KDE(['a', 'c', 'd', 'b'])
+    cpd2 = pbn.KDE(["A", "C", "D", "B"])
    cpd2.fit(df_float)

-    assert np.all(np.isclose(cpd.logl(test_df_float), cpd2.logl(test_df_float))), "Order of evidence changes logl() result."
+    assert np.all(
+        np.isclose(cpd.logl(test_df_float), cpd2.logl(test_df_float))
+    ), "Order of evidence changes logl() result."

+
def test_kde_logl_null():
+    """Tests the logl() method of the KDE factor with null values. It compares the results with the ones obtained with scipy's gaussian_kde.
+    Both for float64 and float32 data types."""
+
    def _test_kde_logl_null_iter(variables, _df, _test_df):
+        """Tests that the logl() method of the KDE factor with null values returns the same results as scipy's gaussian_kde.
+        It trains on _df and evaluates on _test_df.
+        Args:
+            variables (list[str]): Dataset variables to use.
+            _df (pd.DataFrame): Training dataset.
+            _test_df (pd.DataFrame): Test dataset.
+ """ cpd = pbn.KDE(variables) cpd.fit(_df) npdata = _df.loc[:, variables].to_numpy() - scipy_kde = gaussian_kde(npdata.T, - bw_method=lambda s : np.power(4 / (s.d + 2), 1 / (s.d + 4)) * s.scotts_factor()) - - test_npdata = _test_df.loc[:, variables].to_numpy() - - logl = cpd.logl(_test_df) - - scipy_result = np.full((test_npdata.shape[0],), np.nan) - nan_rows = np.any(np.isnan(test_npdata), axis=1) - scipy_result[~nan_rows] = scipy_kde.logpdf(test_npdata[~nan_rows].T) + scipy_kde = gaussian_kde( + npdata.T, + bw_method=lambda s: np.power(4 / (s.d + 2), 1 / (s.d + 4)) + * s.scotts_factor(), + ) + # We initialize the logl and scipy_logl columns with NaN + _test_df["logl"] = np.nan + _test_df["scipy_logl"] = np.nan + + # We calculate the logl with the KDE factor + _test_df["logl"] = cpd.logl(_test_df) + + # We calculate the logl with scipy (we have to avoid NaN values) + non_nan_index = _test_df[variables].notna().all(1) + _test_df.loc[non_nan_index, "scipy_logl"] = scipy_kde.logpdf( + _test_df.loc[non_nan_index, variables].T.to_numpy() + ) if npdata.dtype == "float32": - assert np.all(np.isclose(logl, scipy_result, atol=0.0005, equal_nan=True)) + assert np.all( + np.isclose( + _test_df["logl"], + _test_df["scipy_logl"], + atol=0.0005, + equal_nan=True, + ) + ) else: - assert np.all(np.isclose(logl, scipy_result, equal_nan=True)) + assert np.all( + np.isclose(_test_df["logl"], _test_df["scipy_logl"], equal_nan=True) + ) TEST_SIZE = 50 - test_df = util_test.generate_normal_data(TEST_SIZE, seed=1) - test_df_float = test_df.astype('float32') + test_df = generate_normal_data(TEST_SIZE, seed=1) + test_df_float = test_df.astype("float32") np.random.seed(0) a_null = np.random.randint(0, TEST_SIZE, size=10) @@ -236,86 +407,130 @@ def _test_kde_logl_null_iter(variables, _df, _test_df): d_null = np.random.randint(0, TEST_SIZE, size=10) df_null = test_df.copy() - df_null.loc[df_null.index[a_null], 'a'] = np.nan - df_null.loc[df_null.index[b_null], 'b'] = np.nan - df_null.loc[df_null.index[c_null], 'c'] = np.nan - df_null.loc[df_null.index[d_null], 'd'] = np.nan + df_null.loc[df_null.index[a_null], "A"] = np.nan + df_null.loc[df_null.index[b_null], "B"] = np.nan + df_null.loc[df_null.index[c_null], "C"] = np.nan + df_null.loc[df_null.index[d_null], "D"] = np.nan df_null_float = test_df_float.copy() - df_null_float.loc[df_null_float.index[a_null], 'a'] = np.nan - df_null_float.loc[df_null_float.index[b_null], 'b'] = np.nan - df_null_float.loc[df_null_float.index[c_null], 'c'] = np.nan - df_null_float.loc[df_null_float.index[d_null], 'd'] = np.nan + df_null_float.loc[df_null_float.index[a_null], "A"] = np.nan + df_null_float.loc[df_null_float.index[b_null], "B"] = np.nan + df_null_float.loc[df_null_float.index[c_null], "C"] = np.nan + df_null_float.loc[df_null_float.index[d_null], "D"] = np.nan - for variables in [['a'], ['b', 'a'], ['c', 'a', 'b'], ['d', 'a', 'b', 'c']]: + for variables in [["A"], ["B", "A"], ["C", "A", "B"], ["D", "A", "B", "C"]]: _test_kde_logl_null_iter(variables, df, df_null) _test_kde_logl_null_iter(variables, df_float, df_null_float) - - cpd = pbn.KDE(['d', 'a', 'b', 'c']) + cpd = pbn.KDE(["D", "A", "B", "C"]) cpd.fit(df) - cpd2 = pbn.KDE(['a', 'c', 'd', 'b']) + cpd2 = pbn.KDE(["A", "C", "D", "B"]) cpd2.fit(df) - assert np.all(np.isclose(cpd.logl(df_null), cpd2.logl(df_null), equal_nan=True)), "Order of evidence changes logl() result." + assert np.all( + np.isclose(cpd.logl(df_null), cpd2.logl(df_null), equal_nan=True) + ), "Order of evidence changes logl() result." 
-    cpd = pbn.KDE(['d', 'a', 'b', 'c'])
+    cpd = pbn.KDE(["D", "A", "B", "C"])
     cpd.fit(df_float)
-    cpd2 = pbn.KDE(['a', 'c', 'd', 'b'])
+    cpd2 = pbn.KDE(["A", "C", "D", "B"])
     cpd2.fit(df_float)

-    assert np.all(np.isclose(cpd.logl(df_null_float), cpd2.logl(df_null_float), atol=0.0005, equal_nan=True)), "Order of evidence changes logl() result."
+    assert np.all(
+        np.isclose(
+            cpd.logl(df_null_float),
+            cpd2.logl(df_null_float),
+            atol=0.0005,
+            equal_nan=True,
+        )
+    ), "Order of evidence changes logl() result."

+
def test_kde_slogl():
+    """Tests the slogl() method of the KDE factor. It compares the results with the ones obtained with scipy's gaussian_kde.
+    Both for float64 and float32 data types."""
+
    def _test_kde_slogl_iter(variables, _df, _test_df):
+        """Tests that the slogl() method of the KDE factor returns the same results as scipy's gaussian_kde.
+        It trains on _df and evaluates on _test_df.
+        Args:
+            variables (list[str]): Dataset variables to use.
+            _df (pd.DataFrame): Training dataset.
+            _test_df (pd.DataFrame): Test dataset.
+        """
        cpd = pbn.KDE(variables)
        cpd.fit(_df)

        npdata = _df.loc[:, variables].to_numpy()
-        scipy_kde = gaussian_kde(npdata.T,
-                        bw_method=lambda s : np.power(4 / (s.d + 2), 1 / (s.d + 4)) * s.scotts_factor())
+        scipy_kde = gaussian_kde(
+            npdata.T,
+            bw_method=lambda s: np.power(4 / (s.d + 2), 1 / (s.d + 4))
+            * s.scotts_factor(),
+        )

        test_npdata = _test_df.loc[:, variables].to_numpy()

-        assert np.all(np.isclose(cpd.slogl(_test_df), scipy_kde.logpdf(test_npdata.T).sum()))
-
+        assert np.all(
+            np.isclose(cpd.slogl(_test_df), scipy_kde.logpdf(test_npdata.T).sum())
+        )

-    test_df = util_test.generate_normal_data(50, seed=1)
-    test_df_float = test_df.astype('float32')
+    test_df = generate_normal_data(50, seed=1)
+    test_df_float = test_df.astype("float32")

-    for variables in [['a'], ['b', 'a'], ['c', 'a', 'b'], ['d', 'a', 'b', 'c']]:
+    for variables in [["A"], ["B", "A"], ["C", "A", "B"], ["D", "A", "B", "C"]]:
        _test_kde_slogl_iter(variables, df, test_df)
        _test_kde_slogl_iter(variables, df_float, test_df_float)

-    cpd = pbn.KDE(['d', 'a', 'b', 'c'])
+    cpd = pbn.KDE(["D", "A", "B", "C"])
    cpd.fit(df)
-    cpd2 = pbn.KDE(['a', 'c', 'd', 'b'])
+    cpd2 = pbn.KDE(["A", "C", "D", "B"])
    cpd2.fit(df)

-    assert np.all(np.isclose(cpd.slogl(test_df), cpd2.slogl(test_df))), "Order of evidence changes slogl() result."
+    assert np.all(
+        np.isclose(cpd.slogl(test_df), cpd2.slogl(test_df))
+    ), "Order of evidence changes slogl() result."

-    cpd = pbn.KDE(['d', 'a', 'b', 'c'])
+    cpd = pbn.KDE(["D", "A", "B", "C"])
    cpd.fit(df_float)
-    cpd2 = pbn.KDE(['a', 'c', 'd', 'b'])
+    cpd2 = pbn.KDE(["A", "C", "D", "B"])
    cpd2.fit(df_float)

-    assert np.all(np.isclose(cpd.slogl(test_df_float), cpd2.slogl(test_df_float))), "Order of evidence changes slogl() result."
+    assert np.all(
+        np.isclose(cpd.slogl(test_df_float), cpd2.slogl(test_df_float))
+    ), "Order of evidence changes slogl() result."


def test_kde_slogl_null():
+    """Tests the slogl() method of the KDE factor with null values. It compares the results with the ones obtained with scipy's gaussian_kde.
+    Both for float64 and float32 data types."""
+
    def _test_kde_slogl_null_iter(variables, _df, _test_df):
+        """Tests that the slogl() method of the KDE factor with null values returns the same results as scipy's gaussian_kde.
+        It trains on _df and evaluates on _test_df.
+        Args:
+            variables (list[str]): Dataset variables to use.
+            _df (pd.DataFrame): Training dataset.
+            _test_df (pd.DataFrame): Test dataset.
+        """
        cpd = pbn.KDE(variables)
        cpd.fit(_df)

        npdata = _df.loc[:, variables].to_numpy()
-        scipy_kde = gaussian_kde(npdata.T,
-                        bw_method=lambda s : np.power(4 / (s.d + 2), 1 / (s.d + 4)) * s.scotts_factor())
-
-        test_npdata = _test_df.loc[:, variables].to_numpy()
-
-        nan_rows = np.any(np.isnan(test_npdata), axis=1)
-
-        assert np.all(np.isclose(cpd.slogl(_test_df), scipy_kde.logpdf(test_npdata[~nan_rows].T).sum()))
+        scipy_kde = gaussian_kde(
+            npdata.T,
+            bw_method=lambda s: np.power(4 / (s.d + 2), 1 / (s.d + 4))
+            * s.scotts_factor(),
+        )
+        # We initialize the scipy_logl column with NaN
+        _test_df["scipy_logl"] = np.nan
+        slogl = cpd.slogl(_test_df)
+        # We compute the scipy slogl over the complete rows only (scipy cannot handle NaN values)
+        non_nan_index = _test_df[variables].notna().all(1)
+        scipy_slogl = scipy_kde.logpdf(
+            _test_df.loc[non_nan_index, variables].T.to_numpy()
+        ).sum()
+
+        assert np.all(np.isclose(slogl, scipy_slogl))

    TEST_SIZE = 50
-    test_df = util_test.generate_normal_data(TEST_SIZE, seed=1)
-    test_df_float = test_df.astype('float32')
+    test_df = generate_normal_data(TEST_SIZE, seed=1)
+    test_df_float = test_df.astype("float32")

    np.random.seed(0)
    a_null = np.random.randint(0, TEST_SIZE, size=10)
@@ -324,30 +539,33 @@ def _test_kde_slogl_null_iter(variables, _df, _test_df):
    d_null = np.random.randint(0, TEST_SIZE, size=10)

    df_null = test_df.copy()
-    df_null.loc[df_null.index[a_null], 'a'] = np.nan
-    df_null.loc[df_null.index[b_null], 'b'] = np.nan
-    df_null.loc[df_null.index[c_null], 'c'] = np.nan
-    df_null.loc[df_null.index[d_null], 'd'] = np.nan
+    df_null.loc[df_null.index[a_null], "A"] = np.nan
+    df_null.loc[df_null.index[b_null], "B"] = np.nan
+    df_null.loc[df_null.index[c_null], "C"] = np.nan
+    df_null.loc[df_null.index[d_null], "D"] = np.nan

    df_null_float = test_df_float.copy()
-    df_null_float.loc[df_null_float.index[a_null], 'a'] = np.nan
-    df_null_float.loc[df_null_float.index[b_null], 'b'] = np.nan
-    df_null_float.loc[df_null_float.index[c_null], 'c'] = np.nan
-    df_null_float.loc[df_null_float.index[d_null], 'd'] = np.nan
+    df_null_float.loc[df_null_float.index[a_null], "A"] = np.nan
+    df_null_float.loc[df_null_float.index[b_null], "B"] = np.nan
+    df_null_float.loc[df_null_float.index[c_null], "C"] = np.nan
+    df_null_float.loc[df_null_float.index[d_null], "D"] = np.nan

-    for variables in [['a'], ['b', 'a'], ['c', 'a', 'b'], ['d', 'a', 'b', 'c']]:
+    for variables in [["A"], ["B", "A"], ["C", "A", "B"], ["D", "A", "B", "C"]]:
        _test_kde_slogl_null_iter(variables, df, df_null)
        _test_kde_slogl_null_iter(variables, df_float, df_null_float)

-
-    cpd = pbn.KDE(['d', 'a', 'b', 'c'])
+    cpd = pbn.KDE(["D", "A", "B", "C"])
    cpd.fit(df)
-    cpd2 = pbn.KDE(['a', 'c', 'd', 'b'])
+    cpd2 = pbn.KDE(["A", "C", "D", "B"])
    cpd2.fit(df)

-    assert np.all(np.isclose(cpd.slogl(df_null), cpd2.slogl(df_null))), "Order of evidence changes slogl() result."
+    assert np.all(
+        np.isclose(cpd.slogl(df_null), cpd2.slogl(df_null))
+    ), "Order of evidence changes slogl() result."

-    cpd = pbn.KDE(['d', 'a', 'b', 'c'])
+    cpd = pbn.KDE(["D", "A", "B", "C"])
    cpd.fit(df_float)
-    cpd2 = pbn.KDE(['a', 'c', 'd', 'b'])
+    cpd2 = pbn.KDE(["A", "C", "D", "B"])
    cpd2.fit(df_float)

-    assert np.all(np.isclose(cpd.slogl(df_null_float), cpd2.slogl(df_null_float))), "Order of evidence changes slogl() result."
+    assert np.all(
+        np.isclose(cpd.slogl(df_null_float), cpd2.slogl(df_null_float))
+    ), "Order of evidence changes slogl() result."
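The `bw_method` lambda repeated throughout KDE_test.py encodes how the normal reference rule relates to Scott's rule: scipy's `scotts_factor()` is n**(-1 / (d + 4)), and the normal reference rule rescales it by (4 / (d + 2)) ** (1 / (d + 4)). A minimal sketch of both factors; the helper names are illustrative, not part of the diff:

import numpy as np


def scott_factor(n: int, d: int) -> float:
    # scipy.stats.gaussian_kde.scotts_factor(): n**(-1 / (d + 4))
    return np.power(n, -1.0 / (d + 4))


def normal_reference_factor(n: int, d: int) -> float:
    # Rescales Scott's factor; this coincides with scipy's silverman_factor().
    return np.power(4.0 / (d + 2), 1.0 / (d + 4)) * scott_factor(n, d)


# In both cases the kernel bandwidth matrix compared against cpd.bandwidth is
# the squared factor times the sample covariance of the training data:
#     H = factor**2 * np.cov(data, rowvar=False)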
diff --git a/tests/factors/continuous/LinearGaussianCPD_test.py b/tests/factors/continuous/LinearGaussianCPD_test.py index 546e7691..56b85a20 100644 --- a/tests/factors/continuous/LinearGaussianCPD_test.py +++ b/tests/factors/continuous/LinearGaussianCPD_test.py @@ -2,23 +2,30 @@ import pandas as pd import pyarrow as pa import pybnesian as pbn - -import pytest +from helpers.data import DATA_SIZE, generate_normal_data from scipy.stats import norm -import util_test - -SIZE = 10000 +df = generate_normal_data(DATA_SIZE) -df = util_test.generate_normal_data(SIZE) def test_lg_variable(): - for variable, evidence in [('a', []), ('b', ['a']), ('c', ['a', 'b']), ('d', ['a', 'b', 'c'])]: + for variable, evidence in [ + ("A", []), + ("B", ["A"]), + ("C", ["A", "B"]), + ("D", ["A", "B", "C"]), + ]: cpd = pbn.LinearGaussianCPD(variable, evidence) assert cpd.variable() == variable + def test_lg_evidence(): - for variable, evidence in [('a', []), ('b', ['a']), ('c', ['a', 'b']), ('d', ['a', 'b', 'c'])]: + for variable, evidence in [ + ("A", []), + ("B", ["A"]), + ("C", ["A", "B"]), + ("D", ["A", "B", "C"]), + ]: cpd = pbn.LinearGaussianCPD(variable, evidence) assert cpd.evidence() == evidence @@ -26,53 +33,69 @@ def test_lg_evidence(): def fit_numpy(_df, variable, evidence): df_na = _df.loc[:, [variable] + evidence].dropna() linregress_data = np.column_stack((np.ones(df_na.shape[0]), df_na.loc[:, evidence])) - (beta, res, _, _) = np.linalg.lstsq(linregress_data, df_na.loc[:, variable], rcond=None) - + (beta, res, _, _) = np.linalg.lstsq( + linregress_data, df_na.loc[:, variable], rcond=None + ) + return beta, res / (df_na.count()[variable] - len(evidence) - 1) + def test_lg_data_type(): - cpd = pbn.LinearGaussianCPD("a", []) + cpd = pbn.LinearGaussianCPD("A", []) assert cpd.data_type() == pa.float64() + def test_lg_fit(): - for variable, evidence in [("a", []), ("b", ["a"]), ("c", ["a", "b"]), ("d", ["a", "b", "c"])]: + for variable, evidence in [ + ("A", []), + ("B", ["A"]), + ("C", ["A", "B"]), + ("D", ["A", "B", "C"]), + ]: cpd = pbn.LinearGaussianCPD(variable, evidence) assert not cpd.fitted() cpd.fit(df) assert cpd.fitted() npbeta, npvar = fit_numpy(df, variable, evidence) - + assert np.all(np.isclose(npbeta, cpd.beta)), "Wrong beta vector." assert np.all(np.isclose(npvar, cpd.variance)), "Wrong variance." 
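`fit_numpy` above estimates the CPD by ordinary least squares, dividing the residual sum of squares by n - |evidence| - 1, the residual degrees of freedom after fitting the intercept and one coefficient per parent. A minimal usage sketch on synthetic data (hypothetical, not part of the test suite):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
data = pd.DataFrame({"A": rng.normal(size=10000)})
data["B"] = 2.0 + 0.5 * data["A"] + rng.normal(scale=0.1, size=10000)

beta, variance = fit_numpy(data, "B", ["A"])
# beta is approximately [2.0, 0.5]; variance is approximately 0.1**2,
# matching pbn.LinearGaussianCPD("B", ["A"]).fit(data) up to sampling noise.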
+ def test_lg_fit_null(): np.random.seed(0) - a_null = np.random.randint(0, SIZE, size=100) - b_null = np.random.randint(0, SIZE, size=100) - c_null = np.random.randint(0, SIZE, size=100) - d_null = np.random.randint(0, SIZE, size=100) + a_null = np.random.randint(0, DATA_SIZE, size=100) + b_null = np.random.randint(0, DATA_SIZE, size=100) + c_null = np.random.randint(0, DATA_SIZE, size=100) + d_null = np.random.randint(0, DATA_SIZE, size=100) df_null = df.copy() - df_null.loc[df_null.index[a_null], 'a'] = np.nan - df_null.loc[df_null.index[b_null], 'b'] = np.nan - df_null.loc[df_null.index[c_null], 'c'] = np.nan - df_null.loc[df_null.index[d_null], 'd'] = np.nan - - for variable, evidence in [("a", []), ("b", ["a"]), ("c", ["a", "b"]), ("d", ["a", "b", "c"])]: + df_null.loc[df_null.index[a_null], "A"] = np.nan + df_null.loc[df_null.index[b_null], "B"] = np.nan + df_null.loc[df_null.index[c_null], "C"] = np.nan + df_null.loc[df_null.index[d_null], "D"] = np.nan + + for variable, evidence in [ + ("A", []), + ("B", ["A"]), + ("C", ["A", "B"]), + ("D", ["A", "B", "C"]), + ]: cpd = pbn.LinearGaussianCPD(variable, evidence) assert not cpd.fitted() cpd.fit(df_null) assert cpd.fitted() npbeta, npvar = fit_numpy(df_null, variable, evidence) - + assert np.all(np.isclose(npbeta, cpd.beta)), "Wrong beta vector." assert np.all(np.isclose(npvar, cpd.variance)), "Wrong variance." + def numpy_logpdf(test_df, variable, evidence, beta, variance): npdata = test_df.loc[:, evidence].to_numpy() - means = beta[0] + np.sum(beta[1:]*npdata, axis=1) + means = beta[0] + np.sum(beta[1:] * npdata, axis=1) result = np.empty((test_df.shape[0],)) @@ -81,12 +104,17 @@ def numpy_logpdf(test_df, variable, evidence, beta, variance): isnan_vec[np.isnan(test_df.loc[:, variable].to_numpy())] = True result[isnan_vec] = np.nan - result[~isnan_vec] = norm.logpdf(test_df.loc[:, variable].to_numpy()[~isnan_vec], means[~isnan_vec], np.sqrt(variance)) + result[~isnan_vec] = norm.logpdf( + test_df.loc[:, variable].to_numpy()[~isnan_vec], + means[~isnan_vec], + np.sqrt(variance), + ) return result + def numpy_cdf(test_df, variable, evidence, beta, variance): npdata = test_df.loc[:, evidence].to_numpy() - means = beta[0] + np.sum(beta[1:]*npdata, axis=1) + means = beta[0] + np.sum(beta[1:] * npdata, axis=1) result = np.empty((test_df.shape[0],)) @@ -95,32 +123,54 @@ def numpy_cdf(test_df, variable, evidence, beta, variance): isnan_vec[np.isnan(test_df.loc[:, variable].to_numpy())] = True result[isnan_vec] = np.nan - result[~isnan_vec] = norm.cdf(test_df.loc[:, variable].to_numpy()[~isnan_vec], means[~isnan_vec], np.sqrt(variance)) + result[~isnan_vec] = norm.cdf( + test_df.loc[:, variable].to_numpy()[~isnan_vec], + means[~isnan_vec], + np.sqrt(variance), + ) return result -def test_lg_logl(): - test_df = util_test.generate_normal_data(5000) - for variable, evidence in [('a', []), ('b', ['a']), ('c', ['a', 'b']), ('d', ['a', 'b', 'c'])]: +def test_lg_logl(): + test_df = generate_normal_data(5000) + + for variable, evidence in [ + ("A", []), + ("B", ["A"]), + ("C", ["A", "B"]), + ("D", ["A", "B", "C"]), + ]: cpd = pbn.LinearGaussianCPD(variable, evidence) cpd.fit(df) beta = cpd.beta variance = cpd.variance - assert np.all(np.isclose(cpd.logl(test_df), numpy_logpdf(test_df, variable, evidence, beta, variance))),\ - "Wrong logl for LinearGaussianCPD(" + str(variable) + " | " + str(evidence) + ")" - - - cpd = pbn.LinearGaussianCPD('d', ['a', 'b', 'c']) + assert np.all( + np.isclose( + cpd.logl(test_df), + numpy_logpdf(test_df, variable, 
evidence, beta, variance), + ) + ), ( + "Wrong logl for LinearGaussianCPD(" + + str(variable) + + " | " + + str(evidence) + + ")" + ) + + cpd = pbn.LinearGaussianCPD("D", ["A", "B", "C"]) cpd.fit(df) - cpd2 = pbn.LinearGaussianCPD('d', ['c', 'a', 'b']) + cpd2 = pbn.LinearGaussianCPD("D", ["C", "A", "B"]) cpd2.fit(df) - assert np.all(np.isclose(cpd.logl(test_df), cpd2.logl(test_df))), "The order of the evidence changes the logl() result." + assert np.all( + np.isclose(cpd.logl(test_df), cpd2.logl(test_df)) + ), "The order of the evidence changes the logl() result." + def test_lg_logl_null(): - test_df = util_test.generate_normal_data(5000) + test_df = generate_normal_data(5000) np.random.seed(0) a_null = np.random.randint(0, 5000, size=100) @@ -129,55 +179,87 @@ def test_lg_logl_null(): d_null = np.random.randint(0, 5000, size=100) df_null = test_df.copy() - df_null.loc[df_null.index[a_null], 'a'] = np.nan - df_null.loc[df_null.index[b_null], 'b'] = np.nan - df_null.loc[df_null.index[c_null], 'c'] = np.nan - df_null.loc[df_null.index[d_null], 'd'] = np.nan - - for variable, evidence in [('a', []), ('b', ['a']), ('c', ['a', 'b']), ('d', ['a', 'b', 'c'])]: + df_null.loc[df_null.index[a_null], "A"] = np.nan + df_null.loc[df_null.index[b_null], "B"] = np.nan + df_null.loc[df_null.index[c_null], "C"] = np.nan + df_null.loc[df_null.index[d_null], "D"] = np.nan + + for variable, evidence in [ + ("A", []), + ("B", ["A"]), + ("C", ["A", "B"]), + ("D", ["A", "B", "C"]), + ]: cpd = pbn.LinearGaussianCPD(variable, evidence) cpd.fit(df) beta = cpd.beta variance = cpd.variance - assert np.all(np.isclose( - cpd.logl(test_df), - numpy_logpdf(test_df, variable, evidence, beta, variance), equal_nan=True)),\ - "Wrong logl for LinearGaussianCPD(" + str(variable) + " | " + str(evidence) + ") with null values." - - cpd = pbn.LinearGaussianCPD('d', ['a', 'b', 'c']) + assert np.all( + np.isclose( + cpd.logl(test_df), + numpy_logpdf(test_df, variable, evidence, beta, variance), + equal_nan=True, + ) + ), ( + "Wrong logl for LinearGaussianCPD(" + + str(variable) + + " | " + + str(evidence) + + ") with null values." + ) + + cpd = pbn.LinearGaussianCPD("D", ["A", "B", "C"]) cpd.fit(df) - cpd2 = pbn.LinearGaussianCPD('d', ['c', 'a', 'b']) + cpd2 = pbn.LinearGaussianCPD("D", ["C", "A", "B"]) cpd2.fit(df) - assert np.all(np.isclose( - cpd.logl(test_df), - cpd2.logl(test_df), equal_nan=True)),\ - "The order of the evidence changes the logl() result." + assert np.all( + np.isclose(cpd.logl(test_df), cpd2.logl(test_df), equal_nan=True) + ), "The order of the evidence changes the logl() result." 
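For reference, `numpy_logpdf` and `numpy_cdf` above transcribe the linear Gaussian model directly: given evidence values $e$ and fitted parameters $(\beta, \sigma^2)$,

$$x \mid e \sim \mathcal{N}\Big(\beta_0 + \sum_j \beta_j e_j,\; \sigma^2\Big),$$

so the reference log-likelihood is the normal log-density at that mean and variance, the reference cdf is the corresponding $\Phi$, and rows with a missing target propagate NaN.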
-def test_lg_slogl(): - test_df = util_test.generate_normal_data(5000) - for variable, evidence in [('a', []), ('b', ['a']), ('c', ['a', 'b']), ('d', ['a', 'b', 'c'])]: +def test_lg_slogl(): + test_df = generate_normal_data(5000) + + for variable, evidence in [ + ("A", []), + ("B", ["A"]), + ("C", ["A", "B"]), + ("D", ["A", "B", "C"]), + ]: cpd = pbn.LinearGaussianCPD(variable, evidence) cpd.fit(df) beta = cpd.beta variance = cpd.variance - assert np.all(np.isclose(cpd.slogl(test_df), np.sum(numpy_logpdf(test_df, variable, evidence, beta, variance)))),\ - "Wrong slogl for LinearGaussianCPD(" + str(variable) + " | " + str(evidence) + ")" - - cpd = pbn.LinearGaussianCPD('d', ['a', 'b', 'c']) + assert np.all( + np.isclose( + cpd.slogl(test_df), + np.sum(numpy_logpdf(test_df, variable, evidence, beta, variance)), + ) + ), ( + "Wrong slogl for LinearGaussianCPD(" + + str(variable) + + " | " + + str(evidence) + + ")" + ) + + cpd = pbn.LinearGaussianCPD("D", ["A", "B", "C"]) cpd.fit(df) - cpd2 = pbn.LinearGaussianCPD('d', ['c', 'a', 'b']) + cpd2 = pbn.LinearGaussianCPD("D", ["C", "A", "B"]) cpd2.fit(df) - assert np.all(np.isclose(cpd.slogl(test_df), cpd2.slogl(test_df))), "The order of the evidence changes the slogl() result." + assert np.all( + np.isclose(cpd.slogl(test_df), cpd2.slogl(test_df)) + ), "The order of the evidence changes the slogl() result." + def test_lg_slogl_null(): - test_df = util_test.generate_normal_data(5000) + test_df = generate_normal_data(5000) np.random.seed(0) a_null = np.random.randint(0, 5000, size=100) @@ -186,51 +268,85 @@ def test_lg_slogl_null(): d_null = np.random.randint(0, 5000, size=100) df_null = test_df.copy() - df_null.loc[df_null.index[a_null], 'a'] = np.nan - df_null.loc[df_null.index[b_null], 'b'] = np.nan - df_null.loc[df_null.index[c_null], 'c'] = np.nan - df_null.loc[df_null.index[d_null], 'd'] = np.nan - - for variable, evidence in [('a', []), ('b', ['a']), ('c', ['a', 'b']), ('d', ['a', 'b', 'c'])]: + df_null.loc[df_null.index[a_null], "A"] = np.nan + df_null.loc[df_null.index[b_null], "B"] = np.nan + df_null.loc[df_null.index[c_null], "C"] = np.nan + df_null.loc[df_null.index[d_null], "D"] = np.nan + + for variable, evidence in [ + ("A", []), + ("B", ["A"]), + ("C", ["A", "B"]), + ("D", ["A", "B", "C"]), + ]: cpd = pbn.LinearGaussianCPD(variable, evidence) cpd.fit(df) beta = cpd.beta variance = cpd.variance - assert np.all(np.isclose(cpd.slogl(df_null), np.nansum(numpy_logpdf(df_null, variable, evidence, beta, variance)))),\ - "Wrong slogl for LinearGaussianCPD(" + str(variable) + " | " + str(evidence) + ") with null values." - - cpd = pbn.LinearGaussianCPD('d', ['a', 'b', 'c']) + assert np.all( + np.isclose( + cpd.slogl(df_null), + np.nansum(numpy_logpdf(df_null, variable, evidence, beta, variance)), + ) + ), ( + "Wrong slogl for LinearGaussianCPD(" + + str(variable) + + " | " + + str(evidence) + + ") with null values." + ) + + cpd = pbn.LinearGaussianCPD("D", ["A", "B", "C"]) cpd.fit(df) - cpd2 = pbn.LinearGaussianCPD('d', ['c', 'a', 'b']) + cpd2 = pbn.LinearGaussianCPD("D", ["C", "A", "B"]) cpd2.fit(df) - assert np.all(np.isclose(cpd.slogl(df_null), cpd2.slogl(df_null))), "The order of the evidence changes the slogl() result." + assert np.all( + np.isclose(cpd.slogl(df_null), cpd2.slogl(df_null)) + ), "The order of the evidence changes the slogl() result." 
-def test_lg_cdf(): - test_df = util_test.generate_normal_data(5000) - for variable, evidence in [('a', []), ('b', ['a']), ('c', ['a', 'b']), ('d', ['a', 'b', 'c'])]: +def test_lg_cdf(): + test_df = generate_normal_data(5000) + + for variable, evidence in [ + ("A", []), + ("B", ["A"]), + ("C", ["A", "B"]), + ("D", ["A", "B", "C"]), + ]: cpd = pbn.LinearGaussianCPD(variable, evidence) cpd.fit(df) beta = cpd.beta variance = cpd.variance - assert np.all(np.isclose(cpd.cdf(test_df), numpy_cdf(test_df, variable, evidence, beta, variance))),\ - "Wrong cdf for LinearGaussianCPD(" + str(variable) + " | " + str(evidence) + ")" - - - cpd = pbn.LinearGaussianCPD('d', ['a', 'b', 'c']) + assert np.all( + np.isclose( + cpd.cdf(test_df), numpy_cdf(test_df, variable, evidence, beta, variance) + ) + ), ( + "Wrong cdf for LinearGaussianCPD(" + + str(variable) + + " | " + + str(evidence) + + ")" + ) + + cpd = pbn.LinearGaussianCPD("D", ["A", "B", "C"]) cpd.fit(df) - cpd2 = pbn.LinearGaussianCPD('d', ['c', 'a', 'b']) + cpd2 = pbn.LinearGaussianCPD("D", ["C", "A", "B"]) cpd2.fit(df) - assert np.all(np.isclose(cpd.cdf(test_df), cpd2.cdf(test_df))), "The order of the evidence changes the cdf() result." + assert np.all( + np.isclose(cpd.cdf(test_df), cpd2.cdf(test_df)) + ), "The order of the evidence changes the cdf() result." + def test_lg_cdf_null(): - test_df = util_test.generate_normal_data(5000) + test_df = generate_normal_data(5000) np.random.seed(0) a_null = np.random.randint(0, 5000, size=100) @@ -239,59 +355,74 @@ def test_lg_cdf_null(): d_null = np.random.randint(0, 5000, size=100) df_null = test_df.copy() - df_null.loc[df_null.index[a_null], 'a'] = np.nan - df_null.loc[df_null.index[b_null], 'b'] = np.nan - df_null.loc[df_null.index[c_null], 'c'] = np.nan - df_null.loc[df_null.index[d_null], 'd'] = np.nan - - for variable, evidence in [('a', []), ('b', ['a']), ('c', ['a', 'b']), ('d', ['a', 'b', 'c'])]: + df_null.loc[df_null.index[a_null], "A"] = np.nan + df_null.loc[df_null.index[b_null], "B"] = np.nan + df_null.loc[df_null.index[c_null], "C"] = np.nan + df_null.loc[df_null.index[d_null], "D"] = np.nan + + for variable, evidence in [ + ("A", []), + ("B", ["A"]), + ("C", ["A", "B"]), + ("D", ["A", "B", "C"]), + ]: cpd = pbn.LinearGaussianCPD(variable, evidence) cpd.fit(df) beta = cpd.beta variance = cpd.variance - assert np.all(np.isclose( - cpd.cdf(df_null), - numpy_cdf(df_null, variable, evidence, beta, variance), equal_nan=True)),\ - "Wrong cdf for LinearGaussianCPD(" + str(variable) + " | " + str(evidence) + ") with null values." - - cpd = pbn.LinearGaussianCPD('d', ['a', 'b', 'c']) + assert np.all( + np.isclose( + cpd.cdf(df_null), + numpy_cdf(df_null, variable, evidence, beta, variance), + equal_nan=True, + ) + ), ( + "Wrong cdf for LinearGaussianCPD(" + + str(variable) + + " | " + + str(evidence) + + ") with null values." + ) + + cpd = pbn.LinearGaussianCPD("D", ["A", "B", "C"]) cpd.fit(df) - cpd2 = pbn.LinearGaussianCPD('d', ['c', 'a', 'b']) + cpd2 = pbn.LinearGaussianCPD("D", ["C", "A", "B"]) cpd2.fit(df) - assert np.all(np.isclose( - cpd.cdf(df_null), - cpd2.cdf(df_null), equal_nan=True)),\ - "The order of the evidence changes the cdf() result." + assert np.all( + np.isclose(cpd.cdf(df_null), cpd2.cdf(df_null), equal_nan=True) + ), "The order of the evidence changes the cdf() result." 
+ def test_lg_sample(): SAMPLE_SIZE = 1000 - cpd = pbn.LinearGaussianCPD('a', []) + cpd = pbn.LinearGaussianCPD("A", []) cpd.fit(df) - + sampled = cpd.sample(SAMPLE_SIZE, None, 0) assert sampled.type == pa.float64() assert int(sampled.nbytes / (sampled.type.bit_width / 8)) == SAMPLE_SIZE - - cpd = pbn.LinearGaussianCPD('b', ['a']) + + cpd = pbn.LinearGaussianCPD("B", ["A"]) cpd.fit(df) - sampling_df = pd.DataFrame({'a': np.full((SAMPLE_SIZE,), 3.0)}) + sampling_df = pd.DataFrame({"A": np.full((SAMPLE_SIZE,), 3.0)}) sampled = cpd.sample(SAMPLE_SIZE, sampling_df, 0) assert sampled.type == pa.float64() assert int(sampled.nbytes / (sampled.type.bit_width / 8)) == SAMPLE_SIZE - - cpd = pbn.LinearGaussianCPD('c', ['a', 'b']) + + cpd = pbn.LinearGaussianCPD("C", ["A", "B"]) cpd.fit(df) - sampling_df = pd.DataFrame({'a': np.full((SAMPLE_SIZE,), 3.0), - 'b': np.full((SAMPLE_SIZE,), 7.45)}) + sampling_df = pd.DataFrame( + {"A": np.full((SAMPLE_SIZE,), 3.0), "B": np.full((SAMPLE_SIZE,), 7.45)} + ) sampled = cpd.sample(SAMPLE_SIZE, sampling_df, 0) assert sampled.type == pa.float64() - assert int(sampled.nbytes / (sampled.type.bit_width / 8)) == SAMPLE_SIZE \ No newline at end of file + assert int(sampled.nbytes / (sampled.type.bit_width / 8)) == SAMPLE_SIZE diff --git a/tests/factors/continuous/ProductKDE_test.py b/tests/factors/continuous/ProductKDE_test.py index 17625cd9..f8cc6fee 100644 --- a/tests/factors/continuous/ProductKDE_test.py +++ b/tests/factors/continuous/ProductKDE_test.py @@ -1,19 +1,17 @@ -import pytest import numpy as np import pyarrow as pa import pybnesian as pbn -from pybnesian import BandwidthSelector +import pytest +from helpers.data import generate_normal_data from scipy.stats import gaussian_kde -from functools import reduce - -import util_test SIZE = 500 -df = util_test.generate_normal_data(SIZE, seed=0) -df_float = df.astype('float32') +df = generate_normal_data(SIZE, seed=0) +df_float = df.astype("float32") + def test_check_type(): - cpd = pbn.ProductKDE(['a']) + cpd = pbn.ProductKDE(["A"]) cpd.fit(df) with pytest.raises(ValueError) as ex: cpd.logl(df_float) @@ -30,11 +28,13 @@ def test_check_type(): cpd.slogl(df) assert "Data type of training and test datasets is different." 
in str(ex.value) + def test_productkde_variables(): - for variables in [['a'], ['b', 'a'], ['c', 'a', 'b'], ['d', 'a', 'b', 'c']]: + for variables in [["A"], ["B", "A"], ["C", "A", "B"], ["D", "A", "B", "C"]]: cpd = pbn.ProductKDE(variables) assert cpd.variables() == variables + def py_nr_bandwidth(df, variables): cov = df[variables].cov().to_numpy() delta = np.linalg.inv(np.diag(np.diag(cov))).dot(cov) @@ -42,9 +42,15 @@ def py_nr_bandwidth(df, variables): N = df.shape[0] d = len(variables) - k = 4*d*np.sqrt(np.linalg.det(delta))/ (2*(delta_inv.dot(delta_inv)).trace() + delta_inv.trace()**2) + k = ( + 4 + * d + * np.sqrt(np.linalg.det(delta)) + / (2 * (delta_inv.dot(delta_inv)).trace() + delta_inv.trace() ** 2) + ) return np.power(k / N, 2 / (d + 4)) * np.diag(cov) + def py_scott_bandwidth(df, variables): var = df[variables].var().to_numpy() N = df.shape[0] @@ -52,26 +58,42 @@ def py_scott_bandwidth(df, variables): return np.power(N, -2 / (d + 4)) * var + def test_productkde_bandwidth(): # for variables in [['a'], ['b', 'a'], ['c', 'a', 'b'], ['d', 'a', 'b', 'c']]: - for variables in [['c', 'a', 'b'], ['d', 'a', 'b', 'c']]: + for variables in [["C", "A", "B"], ["D", "A", "B", "C"]]: for instances in [50, 150, 500]: cpd = pbn.ProductKDE(variables) cpd.fit(df.iloc[:instances]) - assert np.all(np.isclose(cpd.bandwidth, py_nr_bandwidth(df[:instances], variables))), "Wrong bandwidth computed with normal reference rule." + assert np.all( + np.isclose(cpd.bandwidth, py_nr_bandwidth(df[:instances], variables)) + ), "Wrong bandwidth computed with normal reference rule." cpd.fit(df_float.iloc[:instances]) - assert np.all(np.isclose(cpd.bandwidth, py_nr_bandwidth(df[:instances], variables), atol=0.0005)), "Wrong bandwidth computed with normal reference rule." + assert np.all( + np.isclose( + cpd.bandwidth, + py_nr_bandwidth(df[:instances], variables), + atol=0.0005, + ) + ), "Wrong bandwidth computed with normal reference rule." cpd = pbn.ProductKDE(variables, pbn.ScottsBandwidth()) cpd.fit(df.iloc[:instances]) - assert np.all(np.isclose(cpd.bandwidth, py_scott_bandwidth(df[:instances], variables))), "Wrong bandwidth computed with Scott's rule." + assert np.all( + np.isclose(cpd.bandwidth, py_scott_bandwidth(df[:instances], variables)) + ), "Wrong bandwidth computed with Scott's rule." cpd.fit(df_float.iloc[:instances]) - assert np.all(np.isclose(cpd.bandwidth, py_scott_bandwidth(df[:instances], variables), atol=0.0005)), "Wrong bandwidth computed with Scott's rule." - - - cpd = pbn.ProductKDE(['a']) + assert np.all( + np.isclose( + cpd.bandwidth, + py_scott_bandwidth(df[:instances], variables), + atol=0.0005, + ) + ), "Wrong bandwidth computed with Scott's rule." + + cpd = pbn.ProductKDE(["A"]) cpd.fit(df) cpd.bandwidth = [1] assert cpd.bandwidth == np.asarray([1]), "Could not change bandwidth." @@ -80,78 +102,112 @@ def test_productkde_bandwidth(): cpd.bandwidth = [1] assert cpd.bandwidth == np.asarray([1]), "Could not change bandwidth." 
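# A standalone sanity check of the two bandwidth helpers defined above,
# on an illustrative toy sample. With (almost) uncorrelated data, delta is
# close to the identity matrix and the normal reference rule reduces to
# Scott's rule times (4 / (d + 2)) ** (2 / (d + 4)), which equals 1 when
# d == 2, so the two rules should nearly coincide here.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
toy = pd.DataFrame(rng.normal(size=(5000, 2)), columns=["A", "B"])

scott = py_scott_bandwidth(toy, ["A", "B"])
nr = py_nr_bandwidth(toy, ["A", "B"])
assert np.all(np.isclose(scott, nr, rtol=0.05))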
-class UnitaryBandwidth(BandwidthSelector): + +class UnitaryBandwidth(pbn.BandwidthSelector): def __init__(self): - BandwidthSelector.__init__(self) + pbn.BandwidthSelector.__init__(self) def diag_bandwidth(self, df, variables): return np.ones((len(variables),)) - + + def test_productkde_new_bandwidth(): - kde = pbn.ProductKDE(["a"], UnitaryBandwidth()) + kde = pbn.ProductKDE(["A"], UnitaryBandwidth()) kde.fit(df) assert kde.bandwidth == np.ones((1,)) - + kde.fit(df_float) assert kde.bandwidth == np.ones((1,)) - kde = pbn.ProductKDE(["a", "b", "c", "d"], UnitaryBandwidth()) + kde = pbn.ProductKDE(["A", "B", "C", "D"], UnitaryBandwidth()) kde.fit(df) assert np.all(kde.bandwidth == np.ones((4,))) - + kde.fit(df_float) assert np.all(kde.bandwidth == np.ones((4,))) + def test_productkde_data_type(): - k = pbn.ProductKDE(["a"]) + k = pbn.ProductKDE(["A"]) with pytest.raises(ValueError) as ex: k.data_type() - "KDE factor not fitted" in str(ex.value) + assert "KDE factor not fitted" in str(ex.value) k.fit(df) assert k.data_type() == pa.float64() k.fit(df_float) assert k.data_type() == pa.float32() + def test_productkde_fit(): def _test_productkde_fit_iter(variables, _df, instances): cpd = pbn.ProductKDE(variables) assert not cpd.fitted() - cpd.fit(_df.iloc[:instances,:]) + cpd.fit(_df.iloc[:instances, :]) assert cpd.fitted() assert instances == cpd.num_instances(), "Wrong number of training instances." - assert len(variables) == cpd.num_variables(), "Wrong number of training variables." - if np.all(_df.dtypes == 'float32'): - assert np.all(np.isclose(cpd.bandwidth, py_nr_bandwidth(_df.iloc[:instances], variables), atol=0.0005)), "Wrong bandwidth." + assert ( + len(variables) == cpd.num_variables() + ), "Wrong number of training variables." + if np.all(_df.dtypes == "float32"): + assert np.all( + np.isclose( + cpd.bandwidth, + py_nr_bandwidth(_df.iloc[:instances], variables), + atol=0.0005, + ) + ), "Wrong bandwidth." else: - assert np.all(np.isclose(cpd.bandwidth, py_nr_bandwidth(_df.iloc[:instances], variables))), "Wrong bandwidth." + assert np.all( + np.isclose( + cpd.bandwidth, py_nr_bandwidth(_df.iloc[:instances], variables) + ) + ), "Wrong bandwidth." - for variables in [['a'], ['b', 'a'], ['c', 'a', 'b'], ['d', 'a', 'b', 'c']]: + for variables in [["A"], ["B", "A"], ["C", "A", "B"], ["D", "A", "B", "C"]]: for instances in [50, 150, 500]: _test_productkde_fit_iter(variables, df, instances) _test_productkde_fit_iter(variables, df_float, instances) + def test_productkde_fit_null(): def _test_productkde_fit_null_iter(variables, _df, instances): cpd = pbn.ProductKDE(variables) assert not cpd.fitted() - cpd.fit(_df.iloc[:instances,:]) + cpd.fit(_df.iloc[:instances, :]) assert cpd.fitted() npdata = _df.loc[:, variables].to_numpy() - npdata_instances = npdata[:instances,:] + npdata_instances = npdata[:instances, :] nan_rows = np.any(np.isnan(npdata_instances), axis=1) nonnan_indices = np.where(~nan_rows)[0] - assert (~nan_rows).sum() == cpd.num_instances(), "Wrong number of training instances with null values." - assert len(variables) == cpd.num_variables(), "Wrong number of training variables with null values." - if np.all(_df.dtypes == 'float32'): - assert np.all(np.isclose(cpd.bandwidth, py_nr_bandwidth(_df.iloc[nonnan_indices,:], variables), atol=0.0005)), "Wrong bandwidth with null values." + assert ( + ~nan_rows + ).sum() == cpd.num_instances(), ( + "Wrong number of training instances with null values." 
+ ) + assert ( + len(variables) == cpd.num_variables() + ), "Wrong number of training variables with null values." + if np.all(_df.dtypes == "float32"): + assert np.all( + np.isclose( + cpd.bandwidth, + py_nr_bandwidth(_df.iloc[nonnan_indices, :], variables), + atol=0.0005, + ) + ), "Wrong bandwidth with null values." else: - assert np.all(np.isclose(cpd.bandwidth, py_nr_bandwidth(_df.iloc[nonnan_indices,:], variables))), "Wrong bandwidth with null values." + assert np.all( + np.isclose( + cpd.bandwidth, + py_nr_bandwidth(_df.iloc[nonnan_indices, :], variables), + ) + ), "Wrong bandwidth with null values." np.random.seed(0) a_null = np.random.randint(0, SIZE, size=100) @@ -160,18 +216,18 @@ def _test_productkde_fit_null_iter(variables, _df, instances): d_null = np.random.randint(0, SIZE, size=100) df_null = df.copy() - df_null.loc[df_null.index[a_null], 'a'] = np.nan - df_null.loc[df_null.index[b_null], 'b'] = np.nan - df_null.loc[df_null.index[c_null], 'c'] = np.nan - df_null.loc[df_null.index[d_null], 'd'] = np.nan + df_null.loc[df_null.index[a_null], "A"] = np.nan + df_null.loc[df_null.index[b_null], "B"] = np.nan + df_null.loc[df_null.index[c_null], "C"] = np.nan + df_null.loc[df_null.index[d_null], "D"] = np.nan df_null_float = df_float.copy() - df_null_float.loc[df_null_float.index[a_null], 'a'] = np.nan - df_null_float.loc[df_null_float.index[b_null], 'b'] = np.nan - df_null_float.loc[df_null_float.index[c_null], 'c'] = np.nan - df_null_float.loc[df_null_float.index[d_null], 'd'] = np.nan + df_null_float.loc[df_null_float.index[a_null], "A"] = np.nan + df_null_float.loc[df_null_float.index[b_null], "B"] = np.nan + df_null_float.loc[df_null_float.index[c_null], "C"] = np.nan + df_null_float.loc[df_null_float.index[d_null], "D"] = np.nan - for variables in [['a'], ['b', 'a'], ['c', 'a', 'b'], ['d', 'a', 'b', 'c']]: + for variables in [["A"], ["B", "A"], ["C", "A", "B"], ["D", "A", "B", "C"]]: for instances in [50, 150, 500]: _test_productkde_fit_null_iter(variables, df_null, instances) _test_productkde_fit_null_iter(variables, df_null_float, instances) @@ -188,15 +244,25 @@ def factor_product_kernel(train_data): d = train_data.shape[1] num_factor = 4 * d * np.sqrt(np.linalg.det(delta)) - denom_factor = (2 * np.trace(np.dot(delta_inv, delta_inv)) + np.trace(delta_inv)**2) + denom_factor = 2 * np.trace(np.dot(delta_inv, delta_inv)) + np.trace(delta_inv) ** 2 k = num_factor / denom_factor - return (k / N)**(1. / (d + 4.)) + return (k / N) ** (1.0 / (d + 4.0)) def test_productkde_logl(): + """Tests the logl() method of the ProductKDE factor. It compares the results with the ones obtained with scipy's gaussian_kde. + Both for float64 and float32 data types.""" + def _test_productkde_logl_iter(variables, _df, _test_df): + """Tests that the logl() method of the ProductKDE factor returns the same results as scipy's gaussian_kde. + It trains _df and tests it with _test_df. + Args: + variables (list[str]): Dataset variables to use. + _df (pd.DataFrame): Training dataset. + _test_df (pd.DataFrame): Test dataset. 
+ """ cpd = pbn.ProductKDE(variables) cpd.fit(_df) @@ -206,41 +272,60 @@ def _test_productkde_logl_iter(variables, _df, _test_df): factor = factor_product_kernel(npdata) - final_scipy_kde = gaussian_kde(npdata.T, - bw_method=lambda gkde: factor * np.eye(npdata.shape[1], dtype=npdata.dtype)) + final_scipy_kde = gaussian_kde( + npdata.T, + bw_method=lambda gkde: factor * np.eye(npdata.shape[1], dtype=npdata.dtype), + ) final_scipy_kde.cho_cov = np.linalg.cholesky(final_scipy_kde.covariance) - final_scipy_kde.log_det = 2*np.log(np.diag(final_scipy_kde.cho_cov * np.sqrt(2*np.pi))).sum() + final_scipy_kde.log_det = ( + 2 * np.log(np.diag(final_scipy_kde.cho_cov * np.sqrt(2 * np.pi))).sum() + ) test_npdata = _test_df.loc[:, variables].to_numpy() scipy = final_scipy_kde.logpdf(test_npdata.T) - if np.all(_df.dtypes == 'float32'): + if np.all(_df.dtypes == "float32"): assert np.all(np.isclose(logl, scipy, atol=5e-3)) else: assert np.all(np.isclose(logl, scipy)) - test_df = util_test.generate_normal_data(50, seed=1) - test_df_float = test_df.astype('float32') + test_df = generate_normal_data(50, seed=1) + test_df_float = test_df.astype("float32") - for variables in [['a'], ['b', 'a'], ['c', 'a', 'b'], ['d', 'a', 'b', 'c']]: + for variables in [["A"], ["B", "A"], ["C", "A", "B"], ["D", "A", "B", "C"]]: _test_productkde_logl_iter(variables, df, test_df) _test_productkde_logl_iter(variables, df_float, test_df_float) - cpd = pbn.ProductKDE(['d', 'a', 'b', 'c']) + cpd = pbn.ProductKDE(["D", "A", "B", "C"]) cpd.fit(df) - cpd2 = pbn.ProductKDE(['a', 'c', 'd', 'b']) + cpd2 = pbn.ProductKDE(["A", "C", "D", "B"]) cpd2.fit(df) - assert np.all(np.isclose(cpd.logl(test_df), cpd2.logl(test_df))), "Order of evidence changes logl() result." + assert np.all( + np.isclose(cpd.logl(test_df), cpd2.logl(test_df)) + ), "Order of evidence changes logl() result." - cpd = pbn.ProductKDE(['d', 'a', 'b', 'c']) + cpd = pbn.ProductKDE(["D", "A", "B", "C"]) cpd.fit(df_float) - cpd2 = pbn.ProductKDE(['a', 'c', 'd', 'b']) + cpd2 = pbn.ProductKDE(["A", "C", "D", "B"]) cpd2.fit(df_float) - assert np.all(np.isclose(cpd.logl(test_df_float), cpd2.logl(test_df_float), atol=0.0005)), "Order of evidence changes logl() result." + assert np.all( + np.isclose(cpd.logl(test_df_float), cpd2.logl(test_df_float), atol=0.0005) + ), "Order of evidence changes logl() result." + def test_productkde_logl_null(): + """Tests the logl() method of the ProductKDE factor with null values. It compares the results with the ones obtained with scipy's gaussian_kde. + Both for float64 and float32 data types.""" + def _test_productkde_logl_null_iter(variables, _df, _test_df): - cpd = pbn.ProductKDE(variables) + """Tests that the logl() method of the ProductKDE factor with null values returns the same results as scipy's gaussian_kde. + It trains _df and tests it with _test_df. + Args: + variables (list[str]): Dataset variables to use. + _df (pd.DataFrame): Training dataset. + _test_df (pd.DataFrame): Test dataset. 
+ """ + cpd = pbn.ProductKDE(variables, bandwidth_selector=pbn.NormalReferenceRule()) cpd.fit(_df) logl = cpd.logl(_test_df) @@ -248,10 +333,14 @@ def _test_productkde_logl_null_iter(variables, _df, _test_df): npdata = _df.loc[:, variables].to_numpy() factor = factor_product_kernel(npdata) - final_scipy_kde = gaussian_kde(npdata.T, - bw_method=lambda gkde: factor * np.eye(npdata.shape[1], dtype=npdata.dtype)) + final_scipy_kde = gaussian_kde( + npdata.T, + bw_method=lambda gkde: factor * np.eye(npdata.shape[1], dtype=npdata.dtype), + ) final_scipy_kde.cho_cov = np.linalg.cholesky(final_scipy_kde.covariance) - final_scipy_kde.log_det = 2*np.log(np.diag(final_scipy_kde.cho_cov * np.sqrt(2*np.pi))).sum() + final_scipy_kde.log_det = ( + 2 * np.log(np.diag(final_scipy_kde.cho_cov * np.sqrt(2 * np.pi))).sum() + ) test_npdata = _test_df.loc[:, variables].to_numpy() @@ -266,8 +355,8 @@ def _test_productkde_logl_null_iter(variables, _df, _test_df): TEST_SIZE = 50 - test_df = util_test.generate_normal_data(TEST_SIZE, seed=1) - test_df_float = test_df.astype('float32') + test_df = generate_normal_data(TEST_SIZE, seed=1) + test_df_float = test_df.astype("float32") np.random.seed(0) a_null = np.random.randint(0, TEST_SIZE, size=10) @@ -276,102 +365,157 @@ def _test_productkde_logl_null_iter(variables, _df, _test_df): d_null = np.random.randint(0, TEST_SIZE, size=10) df_null = test_df.copy() - df_null.loc[df_null.index[a_null], 'a'] = np.nan - df_null.loc[df_null.index[b_null], 'b'] = np.nan - df_null.loc[df_null.index[c_null], 'c'] = np.nan - df_null.loc[df_null.index[d_null], 'd'] = np.nan + df_null.loc[df_null.index[a_null], "A"] = np.nan + df_null.loc[df_null.index[b_null], "B"] = np.nan + df_null.loc[df_null.index[c_null], "C"] = np.nan + df_null.loc[df_null.index[d_null], "D"] = np.nan df_null_float = test_df_float.copy() - df_null_float.loc[df_null_float.index[a_null], 'a'] = np.nan - df_null_float.loc[df_null_float.index[b_null], 'b'] = np.nan - df_null_float.loc[df_null_float.index[c_null], 'c'] = np.nan - df_null_float.loc[df_null_float.index[d_null], 'd'] = np.nan + df_null_float.loc[df_null_float.index[a_null], "A"] = np.nan + df_null_float.loc[df_null_float.index[b_null], "B"] = np.nan + df_null_float.loc[df_null_float.index[c_null], "C"] = np.nan + df_null_float.loc[df_null_float.index[d_null], "D"] = np.nan - for variables in [['a'], ['b', 'a'], ['c', 'a', 'b'], ['d', 'a', 'b', 'c']]: + for variables in [["A"], ["B", "A"], ["C", "A", "B"], ["D", "A", "B", "C"]]: _test_productkde_logl_null_iter(variables, df, df_null) _test_productkde_logl_null_iter(variables, df_float, df_null_float) - cpd = pbn.ProductKDE(['d', 'a', 'b', 'c']) + cpd = pbn.ProductKDE(["D", "A", "B", "C"]) cpd.fit(df) - cpd2 = pbn.ProductKDE(['a', 'c', 'd', 'b']) + cpd2 = pbn.ProductKDE(["A", "C", "D", "B"]) cpd2.fit(df) - assert np.all(np.isclose(cpd.logl(df_null), cpd2.logl(df_null), equal_nan=True)), "Order of evidence changes logl() result." + assert np.all( + np.isclose(cpd.logl(df_null), cpd2.logl(df_null), equal_nan=True) + ), "Order of evidence changes logl() result." - cpd = pbn.ProductKDE(['d', 'a', 'b', 'c']) + cpd = pbn.ProductKDE(["D", "A", "B", "C"]) cpd.fit(df_float) - cpd2 = pbn.ProductKDE(['a', 'c', 'd', 'b']) + cpd2 = pbn.ProductKDE(["A", "C", "D", "B"]) cpd2.fit(df_float) - assert np.all(np.isclose(cpd.logl(df_null_float), cpd2.logl(df_null_float), atol=0.0005, equal_nan=True)), "Order of evidence changes logl() result." 
+ assert np.all( + np.isclose( + cpd.logl(df_null_float), + cpd2.logl(df_null_float), + atol=0.0005, + equal_nan=True, + ) + ), "Order of evidence changes logl() result." + def test_productkde_slogl(): + """Tests the slogl() method of the ProductKDE factor. It compares the results with the ones obtained with scipy's gaussian_kde.""" + def _test_productkde_slogl_iter(variables, _df, _test_df): - cpd = pbn.ProductKDE(variables) + """Tests that the slogl() method of the ProductKDE factor returns the same results as scipy's gaussian_kde. + + Args: + variables (list[str]): Dataset variables to use. + _df (pd.DataFrame): Training dataset. + _test_df (pd.DataFrame): Test dataset. + """ + cpd = pbn.ProductKDE(variables, bandwidth_selector=pbn.NormalReferenceRule()) cpd.fit(_df) npdata = _df.loc[:, variables].to_numpy() - + factor = factor_product_kernel(npdata) - final_scipy_kde = gaussian_kde(npdata.T, - bw_method=lambda gkde: factor * np.eye(npdata.shape[1], dtype=npdata.dtype)) + final_scipy_kde = gaussian_kde( + npdata.T, + bw_method=lambda gkde: factor * np.eye(npdata.shape[1], dtype=npdata.dtype), + ) final_scipy_kde.cho_cov = np.linalg.cholesky(final_scipy_kde.covariance) - final_scipy_kde.log_det = 2*np.log(np.diag(final_scipy_kde.cho_cov * np.sqrt(2*np.pi))).sum() + final_scipy_kde.log_det = ( + 2 * np.log(np.diag(final_scipy_kde.cho_cov * np.sqrt(2 * np.pi))).sum() + ) test_npdata = _test_df.loc[:, variables].to_numpy() - if np.all(_df.dtypes == 'float32'): - assert np.all(np.isclose(cpd.slogl(_test_df), final_scipy_kde.logpdf(test_npdata.T).sum(), - atol=5e-3*test_npdata.shape[0])) + if np.all(_df.dtypes == "float32"): + assert np.all( + np.isclose( + cpd.slogl(_test_df), + final_scipy_kde.logpdf(test_npdata.T).sum(), + atol=5e-3 * test_npdata.shape[0], + ) + ) else: - assert np.all(np.isclose(cpd.slogl(_test_df), final_scipy_kde.logpdf(test_npdata.T).sum())) + assert np.all( + np.isclose( + cpd.slogl(_test_df), final_scipy_kde.logpdf(test_npdata.T).sum() + ) + ) - test_df = util_test.generate_normal_data(50, seed=1) - test_df_float = test_df.astype('float32') + test_df = generate_normal_data(50, seed=1) + test_df_float = test_df.astype("float32") - for variables in [['a'], ['b', 'a'], ['c', 'a', 'b'], ['d', 'a', 'b', 'c']]: + for variables in [["A"], ["B", "A"], ["C", "A", "B"], ["D", "A", "B", "C"]]: _test_productkde_slogl_iter(variables, df, test_df) _test_productkde_slogl_iter(variables, df_float, test_df_float) - cpd = pbn.ProductKDE(['d', 'a', 'b', 'c']) + cpd = pbn.ProductKDE(["D", "A", "B", "C"]) cpd.fit(df) - cpd2 = pbn.ProductKDE(['a', 'c', 'd', 'b']) + cpd2 = pbn.ProductKDE(["A", "C", "D", "B"]) cpd2.fit(df) - assert np.all(np.isclose(cpd.slogl(test_df), cpd2.slogl(test_df))), "Order of evidence changes slogl() result." + assert np.all( + np.isclose(cpd.slogl(test_df), cpd2.slogl(test_df)) + ), "Order of evidence changes slogl() result." - cpd = pbn.ProductKDE(['d', 'a', 'b', 'c']) + cpd = pbn.ProductKDE(["D", "A", "B", "C"]) cpd.fit(df_float) - cpd2 = pbn.ProductKDE(['a', 'c', 'd', 'b']) + cpd2 = pbn.ProductKDE(["A", "C", "D", "B"]) cpd2.fit(df_float) - assert np.all(np.isclose(cpd.slogl(test_df_float), cpd2.slogl(test_df_float), atol=0.0005)), "Order of evidence changes slogl() result." + assert np.all( + np.isclose(cpd.slogl(test_df_float), cpd2.slogl(test_df_float), atol=0.0005) + ), "Order of evidence changes slogl() result." 
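# The logl()/slogl() oracles above patch scipy's gaussian_kde into a
# diagonal-covariance KDE. The identity that makes this a fair comparison,
# sketched here with fixed illustrative bandwidths rather than pybnesian's
# selectors: a product of univariate Gaussian kernels with bandwidths h_i
# defines the same density as a single multivariate Gaussian kernel with
# covariance diag(h_1 ** 2, ..., h_d ** 2).
import numpy as np
from scipy.stats import multivariate_normal, norm

rng = np.random.default_rng(0)
train = rng.normal(size=(300, 2))
x = np.array([0.1, -0.3])
h = np.array([0.4, 0.7])  # assumed per-dimension bandwidths

# Product kernel: mean over training points of the product of 1-D kernels.
product_kernel = np.mean(np.prod(norm.pdf((x - train) / h) / h, axis=1))

# Equivalent single kernel with diagonal covariance.
diagonal_kernel = np.mean(multivariate_normal.pdf(x - train, cov=np.diag(h**2)))

assert np.isclose(product_kernel, diagonal_kernel)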
def test_productkde_slogl_null(): def _test_productkde_slogl_null_iter(variables, _df, _test_df): - cpd = pbn.ProductKDE(variables) + """Tests that the slogl() method of the ProductKDE factor with null values returns the same results as scipy's gaussian_kde. + + Args: + variables (list[str]): Dataset variables to use. + _df (pd.DataFrame): Training dataset. + _test_df (pd.DataFrame): Test dataset. + """ + cpd = pbn.ProductKDE(variables, bandwidth_selector=pbn.NormalReferenceRule()) cpd.fit(_df) npdata = _df.loc[:, variables].to_numpy() - + factor = factor_product_kernel(npdata) - final_scipy_kde = gaussian_kde(npdata.T, - bw_method=lambda gkde: factor * np.eye(npdata.shape[1], dtype=npdata.dtype)) + final_scipy_kde = gaussian_kde( + npdata.T, + bw_method=lambda gkde: factor * np.eye(npdata.shape[1], dtype=npdata.dtype), + ) final_scipy_kde.cho_cov = np.linalg.cholesky(final_scipy_kde.covariance) - final_scipy_kde.log_det = 2*np.log(np.diag(final_scipy_kde.cho_cov * np.sqrt(2*np.pi))).sum() + final_scipy_kde.log_det = ( + 2 * np.log(np.diag(final_scipy_kde.cho_cov * np.sqrt(2 * np.pi))).sum() + ) test_npdata = _test_df.loc[:, variables].to_numpy() nan_rows = np.any(np.isnan(test_npdata), axis=1) if npdata.dtype == "float32": - assert np.all(np.isclose(cpd.slogl(_test_df), - np.sum(final_scipy_kde.logpdf(test_npdata[~nan_rows].T)), - atol=5e-3*test_npdata.shape[0])) + assert np.all( + np.isclose( + cpd.slogl(_test_df), + np.sum(final_scipy_kde.logpdf(test_npdata[~nan_rows].T)), + atol=5e-3 * test_npdata.shape[0], + ) + ) else: - assert np.all(np.isclose(cpd.slogl(_test_df), - np.sum(final_scipy_kde.logpdf(test_npdata[~nan_rows].T)))) + assert np.all( + np.isclose( + cpd.slogl(_test_df), + np.sum(final_scipy_kde.logpdf(test_npdata[~nan_rows].T)), + ) + ) TEST_SIZE = 50 - test_df = util_test.generate_normal_data(TEST_SIZE, seed=1) - test_df_float = test_df.astype('float32') + test_df = generate_normal_data(TEST_SIZE, seed=1) + test_df_float = test_df.astype("float32") np.random.seed(0) a_null = np.random.randint(0, TEST_SIZE, size=10) @@ -380,30 +524,33 @@ def _test_productkde_slogl_null_iter(variables, _df, _test_df): d_null = np.random.randint(0, TEST_SIZE, size=10) df_null = test_df.copy() - df_null.loc[df_null.index[a_null], 'a'] = np.nan - df_null.loc[df_null.index[b_null], 'b'] = np.nan - df_null.loc[df_null.index[c_null], 'c'] = np.nan - df_null.loc[df_null.index[d_null], 'd'] = np.nan + df_null.loc[df_null.index[a_null], "A"] = np.nan + df_null.loc[df_null.index[b_null], "B"] = np.nan + df_null.loc[df_null.index[c_null], "C"] = np.nan + df_null.loc[df_null.index[d_null], "D"] = np.nan df_null_float = test_df_float.copy() - df_null_float.loc[df_null_float.index[a_null], 'a'] = np.nan - df_null_float.loc[df_null_float.index[b_null], 'b'] = np.nan - df_null_float.loc[df_null_float.index[c_null], 'c'] = np.nan - df_null_float.loc[df_null_float.index[d_null], 'd'] = np.nan + df_null_float.loc[df_null_float.index[a_null], "A"] = np.nan + df_null_float.loc[df_null_float.index[b_null], "B"] = np.nan + df_null_float.loc[df_null_float.index[c_null], "C"] = np.nan + df_null_float.loc[df_null_float.index[d_null], "D"] = np.nan - for variables in [['a'], ['b', 'a'], ['c', 'a', 'b'], ['d', 'a', 'b', 'c']]: + for variables in [["A"], ["B", "A"], ["C", "A", "B"], ["D", "A", "B", "C"]]: _test_productkde_slogl_null_iter(variables, df, df_null) _test_productkde_slogl_null_iter(variables, df_float, df_null_float) - - cpd = pbn.ProductKDE(['d', 'a', 'b', 'c']) + cpd = pbn.ProductKDE(["D", "A", "B", "C"]) 
cpd.fit(df) - cpd2 = pbn.ProductKDE(['a', 'c', 'd', 'b']) + cpd2 = pbn.ProductKDE(["A", "C", "D", "B"]) cpd2.fit(df) - assert np.all(np.isclose(cpd.slogl(df_null), cpd2.slogl(df_null))), "Order of evidence changes slogl() result." + assert np.all( + np.isclose(cpd.slogl(df_null), cpd2.slogl(df_null)) + ), "Order of evidence changes slogl() result." - cpd = pbn.ProductKDE(['d', 'a', 'b', 'c']) + cpd = pbn.ProductKDE(["D", "A", "B", "C"]) cpd.fit(df_float) - cpd2 = pbn.ProductKDE(['a', 'c', 'd', 'b']) + cpd2 = pbn.ProductKDE(["A", "C", "D", "B"]) cpd2.fit(df_float) - assert np.all(np.isclose(cpd.slogl(df_null_float), cpd2.slogl(df_null_float), atol=0.0005)), "Order of evidence changes slogl() result." + assert np.all( + np.isclose(cpd.slogl(df_null_float), cpd2.slogl(df_null_float), atol=0.0005) + ), "Order of evidence changes slogl() result." diff --git a/tests/factors/discrete/DiscreteFactor_test.py b/tests/factors/discrete/DiscreteFactor_test.py index 134575ee..73434ea9 100644 --- a/tests/factors/discrete/DiscreteFactor_test.py +++ b/tests/factors/discrete/DiscreteFactor_test.py @@ -1,37 +1,51 @@ -import pytest import numpy as np import pandas as pd import pyarrow as pa import pybnesian as pbn -import util_test +import pytest +from helpers.data import DATA_SIZE, generate_discrete_data + +df = generate_discrete_data(DATA_SIZE) -df = util_test.generate_discrete_data_dependent(10000) def test_data_type(): a = pbn.DiscreteFactor("A", []) with pytest.raises(ValueError) as ex: a.data_type() - "DiscreteFactor factor not fitted." in str(ex.value) - - categories = np.asarray(["a1", "a2"]) - a_values = pd.Categorical(categories[np.random.randint(len(categories), size=100)], categories=categories, ordered=False) - df = pd.DataFrame({'A': a_values}) + assert "DiscreteFactor factor not fitted." 
in str(ex.value) + + categories = np.asarray(["A1", "A2"]) + a_values = pd.Categorical( + categories[np.random.randint(len(categories), size=100)], + categories=categories, + ordered=False, + ) + df = pd.DataFrame({"A": a_values}) a.fit(df) assert a.data_type() == pa.dictionary(pa.int8(), pa.string()) - categories = np.asarray(["a" + str(i) for i in range(1, 129)]) - a_values = pd.Categorical(categories[np.random.randint(len(categories), size=100)], categories=categories, ordered=False) - df = pd.DataFrame({'A': a_values}) + categories = np.asarray(["A" + str(i) for i in range(1, 129)]) + a_values = pd.Categorical( + categories[np.random.randint(len(categories), size=100)], + categories=categories, + ordered=False, + ) + df = pd.DataFrame({"A": a_values}) a.fit(df) assert a.data_type() == pa.dictionary(pa.int8(), pa.string()) - categories = np.asarray(["a" + str(i) for i in range(1, 130)]) - a_values = pd.Categorical(categories[np.random.randint(len(categories), size=100)], categories=categories, ordered=False) - df = pd.DataFrame({'A': a_values}) + categories = np.asarray(["A" + str(i) for i in range(1, 130)]) + a_values = pd.Categorical( + categories[np.random.randint(len(categories), size=100)], + categories=categories, + ordered=False, + ) + df = pd.DataFrame({"A": a_values}) a.fit(df) assert a.data_type() == pa.dictionary(pa.int16(), pa.string()) + def test_fit(): # a = DiscreteFactor('C', ['A', 'B']) - a = pbn.DiscreteFactor('C', []) + a = pbn.DiscreteFactor("C", []) a.fit(df) diff --git a/tests/factors/factor_type_test.py b/tests/factors/factor_type_test.py index 0a551cb1..b3a4184f 100644 --- a/tests/factors/factor_type_test.py +++ b/tests/factors/factor_type_test.py @@ -1,29 +1,29 @@ -import pytest import pybnesian as pbn -from pybnesian import FactorType, Factor +import pytest + def test_factor_type(): - lg1 = pbn.LinearGaussianCPD("a", []) - lg2 = pbn.LinearGaussianCPD("b", ["a"]) - lg3 = pbn.LinearGaussianCPD("c", ["b", "a"]) + lg1 = pbn.LinearGaussianCPD("A", []) + lg2 = pbn.LinearGaussianCPD("B", ["A"]) + lg3 = pbn.LinearGaussianCPD("C", ["B", "A"]) assert lg1.type() == pbn.LinearGaussianCPDType() assert lg1.type() == lg2.type() assert lg1.type() == lg3.type() assert lg2.type() == lg3.type() - c1 = pbn.CKDE("a", []) - c2 = pbn.CKDE("b", ["a"]) - c3 = pbn.CKDE("c", ["b", "a"]) + c1 = pbn.CKDE("A", []) + c2 = pbn.CKDE("B", ["A"]) + c3 = pbn.CKDE("C", ["B", "A"]) assert c1.type() == pbn.CKDEType() assert c1.type() == c2.type() assert c1.type() == c3.type() assert c2.type() == c3.type() - d1 = pbn.DiscreteFactor("a", []) - d2 = pbn.DiscreteFactor("b", ["a"]) - d3 = pbn.DiscreteFactor("c", ["b", "a"]) + d1 = pbn.DiscreteFactor("A", []) + d2 = pbn.DiscreteFactor("B", ["A"]) + d3 = pbn.DiscreteFactor("C", ["B", "A"]) assert d1.type() == pbn.DiscreteFactorType() assert d1.type() == d2.type() @@ -34,10 +34,11 @@ def test_factor_type(): assert lg1.type() != d1.type() assert c1.type() != d1.type() + def test_new_factor_type(): - class A(FactorType): + class A(pbn.FactorType): def __init__(self): - FactorType.__init__(self) + pbn.FactorType.__init__(self) a1 = A() a2 = A() @@ -47,9 +48,9 @@ def __init__(self): assert a1 == a3 assert a2 == a3 - class B(FactorType): + class B(pbn.FactorType): def __init__(self): - FactorType.__init__(self) + pbn.FactorType.__init__(self) b1 = B() b2 = B() @@ -61,24 +62,25 @@ def __init__(self): assert a1 != b1 + def test_factor_defined_factor_type(): - class F_type(FactorType): + class F_type(pbn.FactorType): def __init__(self): - FactorType.__init__(self) 
+ pbn.FactorType.__init__(self) def __str__(self): return "FType" - class F(Factor): + class F(pbn.Factor): def __init__(self, variable, evidence): - Factor.__init__(self, variable, evidence) + pbn.Factor.__init__(self, variable, evidence) def type(self): return F_type() - f1 = F("a", []) - f2 = F("b", ["a"]) - f3 = F("c", ["a", "b"]) + f1 = F("A", []) + f2 = F("B", ["A"]) + f3 = F("C", ["A", "B"]) assert f1.type() == f2.type() assert f1.type() == f3.type() @@ -86,31 +88,33 @@ def type(self): assert str(f1.type()) == str(f2.type()) == str(f3.type()) == "FType" - dummy_network = pbn.GaussianNetwork(["a", "b", "c", "d"]) + dummy_network = pbn.GaussianNetwork(["A", "B", "C", "D"]) with pytest.raises(RuntimeError) as ex: - f4 = f1.type().new_factor(dummy_network, "d", ["a", "b", "c"]) - assert 'Tried to call pure virtual function "FactorType::new_factor"' in str(ex.value) + f1.type().new_factor(dummy_network, "D", ["A", "B", "C"]) + assert 'Tried to call pure virtual function "FactorType::new_factor"' in str( + ex.value + ) - class G_type(FactorType): + class G_type(pbn.FactorType): def __init__(self): - FactorType.__init__(self) - + pbn.FactorType.__init__(self) + def new_factor(self, model, variable, evidence): return G(variable, evidence) def __str__(self): return "GType" - class G(Factor): + class G(pbn.Factor): def __init__(self, variable, evidence): - Factor.__init__(self, variable, evidence) + pbn.Factor.__init__(self, variable, evidence) def type(self): return G_type() - g1 = G("a", []) - g2 = G("b", ["a"]) - g3 = G("c", ["a", "b"]) + g1 = G("A", []) + g2 = G("B", ["A"]) + g3 = G("C", ["A", "B"]) assert g1.type() == g2.type() assert g1.type() == g3.type() @@ -120,8 +124,8 @@ def type(self): assert str(g1.type()) == str(g2.type()) == str(g3.type()) == "GType" - g4 = g1.type().new_factor(dummy_network, "d", ["a", "b", "c"]) + g4 = g1.type().new_factor(dummy_network, "D", ["A", "B", "C"]) assert g1.type() == g4.type() - assert g4.variable() == "d" - assert g4.evidence() == ["a", "b", "c"] \ No newline at end of file + assert g4.variable() == "D" + assert g4.evidence() == ["A", "B", "C"] diff --git a/tests/helpers/data.py b/tests/helpers/data.py new file mode 100644 index 00000000..b0c2a927 --- /dev/null +++ b/tests/helpers/data.py @@ -0,0 +1,576 @@ +import numpy as np +import pandas as pd + +# Constants +TRUE_CLASS_LABEL = "attack_label" +SUPER_PARENT = "A" +DATA_SIZE = 10000 +SAMPLE_SIZE = 100 + +N_NEIGHBORS = 3 +SEED = 0 + + +def generate_normal_data(size: int, seed: int = SEED) -> pd.DataFrame: + """Generates a DataFrame of normally distributed data with linear Gaussian relationships. + The relationships are as follows: + - A ~ N(3, 0.5) + - B ~ N(2.5 + 1.65 * A, 2) + - C ~ N(-4.2 - 1.2 * A + 3.2 * B, 0.75) + - D ~ N(1.5 - 0.9 * A + 5.6 * B + 0.3 * C, 0.5) + + Args: + size (int): The sample size. + seed (int, optional): The seed for random sampling. Defaults to 0. + + Returns: + pd.DataFrame: The DataFrame. 
+ """ + np.random.seed(seed) + + a_array = np.random.normal(3, 0.5, size=size) + b_array = 2.5 + 1.65 * a_array + np.random.normal(0, 2, size=size) + c_array = ( + -4.2 - 1.2 * a_array + 3.2 * b_array + np.random.normal(0, 0.75, size=size) + ) + d_array = ( + 1.5 + - 0.9 * a_array + + 5.6 * b_array + + 0.3 * c_array + + np.random.normal(0, 0.5, size=size) + ) + df = pd.DataFrame({"A": a_array, "B": b_array, "C": c_array, "D": d_array}) + + return df + + +def generate_normal_data_independent(size: int, seed: int = SEED) -> pd.DataFrame: + """Generates a DataFrame of normally distributed data with linear Gaussian relationships and independent variables. + The relationships are as follows: + - A ~ N(3, 0.5) + - B ~ N(2.5, 2) + - C ~ N(-4.2, 0.75) + - D ~ N(1.5, 0.5) + + Args: + size (int): The sample size. + seed (int, optional): The seed for random sampling. Defaults to 0. + + Returns: + pd.DataFrame: The DataFrame. + """ + np.random.seed(seed) + + a_array = np.random.normal(3, 0.5, size=size) + b_array = np.random.normal(2.5, 2, size=size) + c_array = np.random.normal(-4.2, 0.75, size=size) + d_array = np.random.normal(1.5, 0.5, size=size) + + df = pd.DataFrame({"A": a_array, "B": b_array, "C": c_array, "D": d_array}) + return df + + +def generate_non_normal_data(size: int, seed: int = SEED) -> pd.DataFrame: + """Generates a DataFrame of uniformly distributed data with non-linear relationships. + The relationships are as follows: + - A ~ U(0, 10) + - B ~ U(5, 15) + - C ~ sin(A) + cos(B) + U(-1, 1) + - D ~ exp(A / 10) + log(B + 1) + U(-0.5, 0.5) + + Args: + size (int): The sample size. + seed (int, optional): The seed for random sampling. Defaults to 0. + + Returns: + pd.DataFrame: The DataFrame. + """ + np.random.seed(seed) + + # Generate uniformly distributed data + a_values = np.random.uniform(0, 10, size) + b_values = np.random.uniform(5, 15, size) + + # Generate non-linear relationships + c_values = np.sin(a_values) + np.cos(b_values) + np.random.uniform(-1, 1, size) + d_values = ( + np.exp(a_values / 10) + + np.log(b_values + 1) + + np.random.uniform(-0.5, 0.5, size) + ) + + # DataFrame + df = pd.DataFrame( + { + "A": a_values, + "B": b_values, + "C": c_values, + "D": d_values, + } + ) + return df + + +def generate_discrete_data(size: int, seed: int = SEED) -> pd.DataFrame: + """Generates a DataFrame of discrete data with dependent variables. + The relationships are as follows: + - A ~ Categorical(0.75, 0.25) + - B ~ Categorical(0.33, 0.33, 0.34) if A = A1, else Categorical(0, 0.8, 0.2) + - C ~ Categorical(0.5, 0.5) if A = A1 and B = B1, else Categorical(0.75, 0.25) if A = A1 and B = B2, else Categorical(0.2, 0.8) if A = A1 and B = B3, else Categorical(1, 0) if A = A2 and B = B1, else Categorical(0, 1) if A = A2 and B = B2, else Categorical(0.01, 0.99) if A = A2 and B = B3 + - D ~ Categorical(0.25, 0.25, 0.25, 0.25) if C = C1, else Categorical(0.7, 0, 0.15, 0.15) if C = C2 + + Args: + size (int): The sample size. + seed (int, optional): The seed for random sampling. Defaults to 0. + + Returns: + pd.DataFrame: The DataFrame. 
+ """ + # Initialization + np.random.seed(seed) + + a_dict = np.asarray(["A1", "A2"]) + b_dict = np.asarray(["B1", "B2", "B3"]) + c_dict = np.asarray(["C1", "C2"]) + d_dict = np.asarray(["D1", "D2", "D3", "D4"]) + + a_values = a_dict[np.random.choice(a_dict.size, size, p=[0.75, 0.25])] + b_values = np.empty(size, dtype=object) + c_values = np.empty(size, dtype=object) + d_values = np.empty(size, dtype=object) + + # Indices + a1_indices = a_values == "A1" + a2_indices = a_values == "A2" + + # Sampling B + b_values[a1_indices] = b_dict[ + np.random.choice(b_dict.size, a1_indices.sum(), p=[0.33, 0.33, 0.34]) + ] + b_values[a2_indices] = b_dict[ + np.random.choice(b_dict.size, a2_indices.sum(), p=[0, 0.8, 0.2]) + ] + + # Sampling C + for i in range(size): + if a_values[i] == "A1" and b_values[i] == "B1": + c_values[i] = c_dict[np.random.choice(c_dict.size, p=[0.5, 0.5])] + elif a_values[i] == "A1" and b_values[i] == "B2": + c_values[i] = c_dict[np.random.choice(c_dict.size, p=[0.75, 0.25])] + elif a_values[i] == "A1" and b_values[i] == "B3": + c_values[i] = c_dict[np.random.choice(c_dict.size, p=[0.2, 0.8])] + elif a_values[i] == "A2" and b_values[i] == "B1": + c_values[i] = c_dict[np.random.choice(c_dict.size, p=[1, 0])] + elif a_values[i] == "A2" and b_values[i] == "B2": + c_values[i] = c_dict[np.random.choice(c_dict.size, p=[0, 1])] + elif a_values[i] == "A2" and b_values[i] == "B3": + c_values[i] = c_dict[np.random.choice(c_dict.size, p=[0.01, 0.99])] + + # Sampling D + for i in range(size): + if c_values[i] == "C1": + d_values[i] = d_dict[ + np.random.choice(d_dict.size, p=[0.25, 0.25, 0.25, 0.25]) + ] + elif c_values[i] == "C2": + d_values[i] = d_dict[np.random.choice(d_dict.size, p=[0.7, 0, 0.15, 0.15])] + + # DataFrame + df = pd.DataFrame( + { + "A": pd.Series(a_values, dtype="category"), + "B": pd.Series(b_values, dtype="category"), + "C": pd.Series(c_values, dtype="category"), + "D": pd.Series(d_values, dtype="category"), + } + ) + return df + + +def generate_discrete_data_independent(size: int, seed: int = SEED) -> pd.DataFrame: + """Generates a DataFrame of discrete data with uniform distributions. + The relationships are as follows: + - A ~ Categorical(A1, A2) + - B ~ Categorical(B1, B2, B3) + - C ~ Categorical(C1, C2) + - D ~ Categorical(D1, D2, D3, D4) + + Args: + size (int): The sample size. + seed (int, optional): The seed for random sampling. Defaults to 0. + + Returns: + pd.DataFrame: The DataFrame. + """ + # Initialization + np.random.seed(seed) + + a_dict = np.asarray(["A1", "A2"]) + b_dict = np.asarray(["B1", "B2", "B3"]) + c_dict = np.asarray(["C1", "C2"]) + d_dict = np.asarray(["D1", "D2", "D3", "D4"]) + + # DataFrame + df = pd.DataFrame( + { + "A": a_dict[np.random.randint(0, a_dict.size, size=size)], + "B": b_dict[np.random.randint(0, b_dict.size, size=size)], + "C": c_dict[np.random.randint(0, c_dict.size, size=size)], + "D": d_dict[np.random.randint(0, d_dict.size, size=size)], + }, + dtype="category", + ) + return df + + +def generate_hybrid_data(size: int, seed: int = SEED) -> pd.DataFrame: + """Generates a DataFrame of hybrid data with discrete and continuous variables. 
+ The relationships are as follows: + - A ~ Categorical(0.75, 0.25) + - B ~ Categorical(0.3, 0.4, 0.3) if A = A1, else Categorical(0.2, 0.5, 0.3) + - C ~ N(-4.2, 0.75) + - D ~ N(1, 0.75) if A = A1 and B = B1, else N(-2 + C, 2) if A = A1 and B = B2, else N(-1 + 3 * C, 0.25) if A = A1 and B = B3, else N(2, 1) if A = A2 and B = B1, else N(3.5 - 1.2 * C, 1) if A = A2 and B = B2, else N(4.8 - 2 * C, 1.5) if A = A2 and B = B3 + + Args: + size (int): The sample size. + seed (int, optional): The seed for random sampling. Defaults to 0. + + Returns: + pd.DataFrame: The DataFrame. + """ + # Initialization + np.random.seed(seed) + + a_dict = np.asarray(["A1", "A2"]) + a_values = a_dict[np.random.choice(a_dict.size, size, p=[0.75, 0.25])] + + b_dict = np.asarray(["B1", "B2", "B3"]) + b_values = b_dict[np.random.choice(b_dict.size, size, p=[0.3, 0.4, 0.3])] + + c_values = -4.2 + np.random.normal(0, 0.75, size=size) + d_values = np.empty_like(c_values) + + # Indices + a1b1_indices = np.logical_and(a_values == "A1", b_values == "B1") + a1b2_indices = np.logical_and(a_values == "A1", b_values == "B2") + a1b3_indices = np.logical_and(a_values == "A1", b_values == "B3") + a2b1_indices = np.logical_and(a_values == "A2", b_values == "B1") + a2b2_indices = np.logical_and(a_values == "A2", b_values == "B2") + a2b3_indices = np.logical_and(a_values == "A2", b_values == "B3") + + # Sampling + d_values[a1b1_indices] = np.random.normal(1, 0.75, size=a1b1_indices.sum()) + d_values[a1b2_indices] = ( + -2 + c_values[a1b2_indices] + np.random.normal(0, 2, size=a1b2_indices.sum()) + ) + d_values[a1b3_indices] = ( + -1 + + 3 * c_values[a1b3_indices] + + np.random.normal(0, 0.25, size=a1b3_indices.sum()) + ) + d_values[a2b1_indices] = np.random.normal(2, 1, size=a2b1_indices.sum()) + d_values[a2b2_indices] = ( + 3.5 + + -1.2 * c_values[a2b2_indices] + + np.random.normal(0, 1, size=a2b2_indices.sum()) + ) + d_values[a2b3_indices] = ( + 4.8 + + -2 * c_values[a2b3_indices] + + np.random.normal(0, 1.5, size=a2b3_indices.sum()) + ) + + # DataFrame + df = pd.DataFrame( + { + "A": pd.Series(a_values, dtype="category"), + "B": pd.Series(b_values, dtype="category"), + "C": c_values, + "D": d_values, + } + ) + return df + + +def generate_hybrid_data_independent(size: int, seed: int = SEED) -> pd.DataFrame: + """Generates a DataFrame of hybrid data with independent discrete and continuous variables. + The relationships are as follows: + - D2 ~ Categorical(0.5, 0.5) + - D3 ~ Categorical(0.33, 0.34, 0.33) + - D4 ~ Categorical(0.25, 0.25, 0.25, 0.25) + - D5 ~ Categorical(0.2, 0.2, 0.2, 0.2, 0.2) + - D6 ~ Categorical(0.166, 0.166, 0.166, 0.166, 0.166, 0.17) + - C1 ~ N(-4.2, 0.75) + - C2 ~ N(1, 2) + - C3 ~ N(2, 0.7) + - C4 ~ N(-3, 2.5) + - C5 ~ N(-1.2, 0.5) + - C6 ~ N(3, 1.5) + + Args: + size (int): The sample size. + seed (int, optional): The seed for random sampling. Defaults to 0. + + Returns: + pd.DataFrame: The DataFrame. 
+ """ + np.random.seed(seed) + + # Sampling + d2_dict = np.asarray(["A1", "A2"]) + d2_values = d2_dict[np.random.choice(d2_dict.size, size, p=[0.5, 0.5])] + + d3_dict = np.asarray(["B1", "B2", "B3"]) + d3_values = d3_dict[np.random.choice(d3_dict.size, size, p=[0.33, 0.34, 0.33])] + + d4_dict = np.asarray(["C1", "C2", "C3", "C4"]) + d4_values = d4_dict[ + np.random.choice(d4_dict.size, size, p=[0.25, 0.25, 0.25, 0.25]) + ] + + d5_dict = np.asarray(["D1", "D2", "D3", "D4", "D5"]) + d5_values = d5_dict[ + np.random.choice(d5_dict.size, size, p=[0.2, 0.2, 0.2, 0.2, 0.2]) + ] + + d6_dict = np.asarray(["e1", "e2", "e3", "e4", "e5", "e6"]) + d6_values = d6_dict[ + np.random.choice( + d6_dict.size, size, p=[0.166, 0.166, 0.166, 0.166, 0.166, 0.17] + ) + ] + + c1_values = -4.2 + np.random.normal(0, 0.75, size=size) + c2_values = np.random.normal(1, 2, size=size) + c3_values = np.random.normal(2, 0.7, size=size) + c4_values = np.random.normal(-3, 2.5, size=size) + c5_values = np.random.normal(-1.2, 0.5, size=size) + c6_values = np.random.normal(3, 1.5, size=size) + + # DataFrame + df = pd.DataFrame( + { + "D2": pd.Series(d2_values, dtype="category"), + "D3": pd.Series(d3_values, dtype="category"), + "D4": pd.Series(d4_values, dtype="category"), + "D5": pd.Series(d5_values, dtype="category"), + "D6": pd.Series(d6_values, dtype="category"), + "C1": c1_values, + "C2": c2_values, + "C3": c3_values, + "C4": c4_values, + "C5": c5_values, + "C6": c6_values, + } + ) + return df + + +def generate_discrete_data_classification(size: int, seed: int = SEED) -> pd.DataFrame: + """Generates a DataFrame of discrete data with dependent variables and a true label. + The relationships are as follows: + - TRUE_CLASS_LABEL ~ Categorical(0.3, 0.4, 0.3) + - A ~ Categorical(0.6, 0.4) if TRUE_CLASS_LABEL = class1, else Categorical(0.8, 0.2) if TRUE_CLASS_LABEL = class2, else Categorical(0.5, 0.5) if TRUE_CLASS_LABEL = class3 + - B ~ Categorical(0.5, 0.3, 0.2) if TRUE_CLASS_LABEL = class1, else Categorical(0.2, 0.5, 0.3) if TRUE_CLASS_LABEL = class2, else Categorical(0.3, 0.3, 0.4) if TRUE_CLASS_LABEL = class3 + - C ~ Categorical(0.7, 0.3) if TRUE_CLASS_LABEL = class1, else Categorical(0.4, 0.6) if TRUE_CLASS_LABEL = class2, else Categorical(0.5, 0.5) if TRUE_CLASS_LABEL = class3 + + Args: + size (int): The sample size. + seed (int, optional): The seed for random sampling. Defaults to 0. + + Returns: + pd.DataFrame: The DataFrame. 
+ """ + np.random.seed(seed) + + class_dict = np.asarray(["class1", "class2", "class3"]) + class_values = class_dict[ + np.random.choice(class_dict.size, size, p=[0.3, 0.4, 0.3]) + ] + + a_dict = np.asarray(["A1", "A2"]) + b_dict = np.asarray(["B1", "B2", "B3"]) + c_dict = np.asarray(["C1", "C2"]) + + a_values = np.empty(size, dtype=object) + b_values = np.empty(size, dtype=object) + c_values = np.empty(size, dtype=object) + + # Indices + class1_indices = class_values == "class1" + class2_indices = class_values == "class2" + class3_indices = class_values == "class3" + + # Sampling + a_values[class1_indices] = a_dict[ + np.random.choice(a_dict.size, class1_indices.sum(), p=[0.6, 0.4]) + ] + a_values[class2_indices] = a_dict[ + np.random.choice(a_dict.size, class2_indices.sum(), p=[0.8, 0.2]) + ] + a_values[class3_indices] = a_dict[ + np.random.choice(a_dict.size, class3_indices.sum(), p=[0.5, 0.5]) + ] + + b_values[class1_indices] = b_dict[ + np.random.choice(b_dict.size, class1_indices.sum(), p=[0.5, 0.3, 0.2]) + ] + b_values[class2_indices] = b_dict[ + np.random.choice(b_dict.size, class2_indices.sum(), p=[0.2, 0.5, 0.3]) + ] + b_values[class3_indices] = b_dict[ + np.random.choice(b_dict.size, class3_indices.sum(), p=[0.3, 0.3, 0.4]) + ] + + c_values[class1_indices] = c_dict[ + np.random.choice(c_dict.size, class1_indices.sum(), p=[0.7, 0.3]) + ] + c_values[class2_indices] = c_dict[ + np.random.choice(c_dict.size, class2_indices.sum(), p=[0.4, 0.6]) + ] + c_values[class3_indices] = c_dict[ + np.random.choice(c_dict.size, class3_indices.sum(), p=[0.5, 0.5]) + ] + + # DataFrame + df = pd.DataFrame( + { + TRUE_CLASS_LABEL: pd.Series(class_values, dtype="category"), + "A": pd.Series(a_values, dtype="category"), + "B": pd.Series(b_values, dtype="category"), + "C": pd.Series(c_values, dtype="category"), + } + ) + return df + + +def generate_normal_data_classification(size: int, seed: int = SEED) -> pd.DataFrame: + """Generates a DataFrame of normally distributed data with linear Gaussian relationships and a true label. + The relationships are as follows: + - TRUE_CLASS_LABEL ~ Categorical(0.3, 0.4, 0.3) + - A ~ N(-4.2, 0.75) + - B ~ N(0, 0.25) if TRUE_CLASS_LABEL = class1, else N(1, 0.5) if TRUE_CLASS_LABEL = class2, else N(2, 1) if TRUE_CLASS_LABEL = class3 + - C ~ N(-2 + 2 * B, 1) if TRUE_CLASS_LABEL = class1, else N(1 + 0.5 * B, 0.5) if TRUE_CLASS_LABEL = class2, else N(3 + 3 * B, 0.25) if TRUE_CLASS_LABEL = class3 + size (int): The sample + seed (int, optional): The seed for random sampling. Defaults to 0. + + Returns: + pd.DataFrame: The DataFrame. 
+ """ + # Initialization + np.random.seed(seed) + + class_dict = np.asarray(["class1", "class2", "class3"]) + class_values = class_dict[ + np.random.choice(class_dict.size, size, p=[0.3, 0.4, 0.3]) + ] + + a_values = -4.2 + np.random.normal(0, 0.75, size=size) + + b_values = np.empty_like(a_values) + c_values = np.empty_like(a_values) + + # Indices + class1_indices = class_values == "class1" + class2_indices = class_values == "class2" + class3_indices = class_values == "class3" + + # Sampling + # b_values based on class_values + b_values[class1_indices] = np.random.normal(0, 0.25, size=class1_indices.sum()) + b_values[class2_indices] = np.random.normal(1, 0.5, size=class2_indices.sum()) + b_values[class3_indices] = np.random.normal(2, 1, size=class3_indices.sum()) + + # c_values based on class_values and b_values + c_values[class1_indices] = ( + -2 + + 2 * b_values[class1_indices] + + np.random.normal(0, 1, size=class1_indices.sum()) + ) + c_values[class2_indices] = ( + 1 + + 0.5 * b_values[class2_indices] + + np.random.normal(0, 0.5, size=class2_indices.sum()) + ) + c_values[class3_indices] = ( + 3 + + 3 * b_values[class3_indices] + + np.random.normal(0, 0.25, size=class3_indices.sum()) + ) + + # DataFrame + df = pd.DataFrame( + { + TRUE_CLASS_LABEL: pd.Series(class_values, dtype="category"), + "A": a_values, + "B": b_values, + "C": c_values, + } + ) + return df + + +def generate_non_normal_data_classification( + size: int, seed: int = SEED +) -> pd.DataFrame: + """Generates a DataFrame of uniformly distributed data with non-linear relationships and a true label. + The relationships are as follows: + - TRUE_CLASS_LABEL ~ Categorical(0.3, 0.4, 0.3) + - A ~ U(0, 10) + - B ~ U(5, 15) if TRUE_CLASS_LABEL = class1, else U(10, 20) if TRUE_CLASS_LABEL = class2, else U(15, 25) if TRUE_CLASS_LABEL = class3 + - C ~ sin(A) + cos(B) + U(-1, 1) if TRUE_CLASS_LABEL = class1, else exp(A / 10) + log(B + 1) + U(-0.5, 0.5) if TRUE_CLASS_LABEL = class2, else A * B + U(-2, 2) if TRUE_CLASS_LABEL = class3 + + Args: + size (int): The sample size. + seed (int, optional): The seed for random sampling. Defaults to 0. + + Returns: + pd.DataFrame: The DataFrame. 
+ """ + np.random.seed(seed) + + class_dict = np.asarray(["class1", "class2", "class3"]) + class_values = class_dict[ + np.random.choice(class_dict.size, size, p=[0.3, 0.4, 0.3]) + ] + + a_values = np.random.uniform(0, 10, size) + + b_values = np.empty_like(a_values) + c_values = np.empty_like(a_values) + + # Indices + class1_indices = class_values == "class1" + class2_indices = class_values == "class2" + class3_indices = class_values == "class3" + + # Sampling + b_values[class1_indices] = np.random.uniform(5, 15, size=class1_indices.sum()) + b_values[class2_indices] = np.random.uniform(10, 20, size=class2_indices.sum()) + b_values[class3_indices] = np.random.uniform(15, 25, size=class3_indices.sum()) + + c_values[class1_indices] = ( + np.sin(a_values[class1_indices]) + + np.cos(b_values[class1_indices]) + + np.random.uniform(-1, 1, size=class1_indices.sum()) + ) + c_values[class2_indices] = ( + np.exp(a_values[class2_indices] / 10) + + np.log(b_values[class2_indices] + 1) + + np.random.uniform(-0.5, 0.5, size=class2_indices.sum()) + ) + c_values[class3_indices] = a_values[class3_indices] * b_values[ + class3_indices + ] + np.random.uniform(-2, 2, size=class3_indices.sum()) + + # DataFrame + df = pd.DataFrame( + { + TRUE_CLASS_LABEL: pd.Series(class_values, dtype="category"), + "A": a_values, + "B": b_values, + "C": c_values, + } + ) + return df diff --git a/tests/helpers/util_test.py b/tests/helpers/util_test.py deleted file mode 100644 index 4f523fdf..00000000 --- a/tests/helpers/util_test.py +++ /dev/null @@ -1,175 +0,0 @@ -import numpy as np -import pandas as pd - - -def generate_normal_data(size, seed=0): - np.random.seed(seed) - - a_array = np.random.normal(3, 0.5, size=size) - b_array = 2.5 + 1.65*a_array + np.random.normal(0, 2, size=size) - c_array = -4.2 - 1.2*a_array + 3.2*b_array + np.random.normal(0, 0.75, size=size) - d_array = 1.5 - 0.9*a_array + 5.6*b_array + 0.3 * c_array + np.random.normal(0, 0.5, size=size) - - - return pd.DataFrame({ - 'a': a_array, - 'b': b_array, - 'c': c_array, - 'd': d_array - }) - - -def generate_normal_data_indep(size, seed=0): - np.random.seed(seed) - - a_array = np.random.normal(3, 0.5, size=size) - b_array = np.random.normal(2.5, 2, size=size) - c_array = -4.2 - 1.2*a_array + 3.2*b_array + np.random.normal(0, 0.75, size=size) - d_array = 1.5 - 0.3 * c_array + np.random.normal(0, 0.5, size=size) - - - return pd.DataFrame({ - 'a': a_array, - 'b': b_array, - 'c': c_array, - 'd': d_array - }) - - -def generate_discrete_data_uniform(size, seed=0): - np.random.seed(seed) - - a_dict = np.asarray(['a1', 'a2']) - b_dict = np.asarray(['b1', 'b2', 'b3']) - c_dict = np.asarray(['c1', 'c2']) - d_dict = np.asarray(['d1', 'd2', 'd3', 'd4']) - - return pd.DataFrame({'A': a_dict[np.random.randint(0, a_dict.size, size=size)], - 'B': b_dict[np.random.randint(0, b_dict.size, size=size)], - 'C': c_dict[np.random.randint(0, c_dict.size, size=size)], - 'D': d_dict[np.random.randint(0, d_dict.size, size=size)] - }, dtype='category') - - -def generate_discrete_data_dependent(size, seed=0): - np.random.seed(seed) - - a_dict = np.asarray(['a1', 'a2']) - b_dict = np.asarray(['b1', 'b2', 'b3']) - c_dict = np.asarray(['c1', 'c2']) - d_dict = np.asarray(['d1', 'd2', 'd3', 'd4']) - - a_values = a_dict[np.random.choice(a_dict.size, size, p=[0.75, 0.25])] - b_values = np.empty_like(a_values) - c_values = np.empty_like(a_values) - d_values = np.empty_like(a_values) - - a1_indices = a_values == 'a1' - - b_values[a1_indices] = b_dict[np.random.choice(b_dict.size, 
np.sum(a1_indices), p=[0.33, 0.33, 0.34])] - b_values[~a1_indices] = b_dict[np.random.choice(b_dict.size, np.sum(~a1_indices), p=[0, 0.8, 0.2])] - - a1b1_indices = np.logical_and(a_values == 'a1', b_values == 'b1') - a1b2_indices = np.logical_and(a_values == 'a1', b_values == 'b2') - a1b3_indices = np.logical_and(a_values == 'a1', b_values == 'b3') - a2b1_indices = np.logical_and(a_values == 'a2', b_values == 'b1') - a2b2_indices = np.logical_and(a_values == 'a2', b_values == 'b2') - a2b3_indices = np.logical_and(a_values == 'a2', b_values == 'b3') - - c_values[a1b1_indices] = c_dict[np.random.choice(c_dict.size, np.sum(a1b1_indices), p=[0.5, 0.5])] - c_values[a1b2_indices] = c_dict[np.random.choice(c_dict.size, np.sum(a1b2_indices), p=[0.75, 0.25])] - c_values[a1b3_indices] = c_dict[np.random.choice(c_dict.size, np.sum(a1b3_indices), p=[0.2, 0.8])] - c_values[a2b1_indices] = c_dict[np.random.choice(c_dict.size, np.sum(a2b1_indices), p=[1, 0])] - c_values[a2b2_indices] = c_dict[np.random.choice(c_dict.size, np.sum(a2b2_indices), p=[0, 1])] - c_values[a2b3_indices] = c_dict[np.random.choice(c_dict.size, np.sum(a2b3_indices), p=[0.01, 0.99])] - - c1_indices = c_values == 'c1' - c2_indices = c_values == 'c2' - - d_values[c1_indices] = d_dict[np.random.choice(d_dict.size, np.sum(c1_indices), p=[0.25, 0.25, 0.25, 0.25])] - d_values[c2_indices] = d_dict[np.random.choice(d_dict.size, np.sum(c2_indices), p=[0.7, 0, 0.15, 0.15])] - - return pd.DataFrame({'A': a_values, - 'B': b_values, - 'C': c_values, - 'D': d_values - }, dtype='category') - - -def generate_hybrid_data(size, seed=0): - # - # Generate data from: - # - # A B C - # \ | / - # \ | / - # v - # D - np.random.seed(seed) - - a_dict = np.asarray(['a1', 'a2']) - a_values = a_dict[np.random.choice(a_dict.size, size, p=[0.75, 0.25])] - - b_dict = np.asarray(['b1', 'b2', 'b3']) - b_values = b_dict[np.random.choice(b_dict.size, size, p=[0.3, 0.4, 0.3])] - - c_values = -4.2 + np.random.normal(0, 0.75, size=size) - - a1b1_indices = np.logical_and(a_values == 'a1', b_values == 'b1') - a1b2_indices = np.logical_and(a_values == 'a1', b_values == 'b2') - a1b3_indices = np.logical_and(a_values == 'a1', b_values == 'b3') - a2b1_indices = np.logical_and(a_values == 'a2', b_values == 'b1') - a2b2_indices = np.logical_and(a_values == 'a2', b_values == 'b2') - a2b3_indices = np.logical_and(a_values == 'a2', b_values == 'b3') - - d_values = np.empty_like(c_values) - d_values[a1b1_indices] = np.random.normal(1, 0.75, size=a1b1_indices.sum()) - d_values[a1b2_indices] = -2 + c_values[a1b2_indices] + np.random.normal(0, 2, size=a1b2_indices.sum()) - d_values[a1b3_indices] = -1 + 3*c_values[a1b3_indices] + np.random.normal(0, 0.25, size=a1b3_indices.sum()) - d_values[a2b1_indices] = np.random.normal(2, 1, size=a2b1_indices.sum()) - d_values[a2b2_indices] = 3.5 + -1.2*c_values[a2b2_indices] + np.random.normal(0, 1, size=a2b2_indices.sum()) - d_values[a2b3_indices] = 4.8 + -2*c_values[a2b3_indices] + np.random.normal(0, 1.5, size=a2b3_indices.sum()) - - return pd.DataFrame({'A': pd.Series(a_values, dtype='category'), - 'B': pd.Series(b_values, dtype='category'), - 'C': c_values, - 'D': d_values - }) - -def generate_indep_hybrid_data(size, seed=0): - np.random.seed(seed) - - d2_dict = np.asarray(['a1', 'a2']) - d2_values = d2_dict[np.random.choice(d2_dict.size, size, p=[0.5, 0.5])] - - d3_dict = np.asarray(['b1', 'b2', 'b3']) - d3_values = d3_dict[np.random.choice(d3_dict.size, size, p=[0.33, 0.34, 0.33])] - - d4_dict = np.asarray(['c1', 'c2', 'c3', 'c4']) - 
d4_values = d4_dict[np.random.choice(d4_dict.size, size, p=[0.25, 0.25, 0.25, 0.25])] - - d5_dict = np.asarray(['d1', 'd2', 'd3', 'd4', 'd5']) - d5_values = d5_dict[np.random.choice(d5_dict.size, size, p=[0.2, 0.2, 0.2, 0.2, 0.2])] - - d6_dict = np.asarray(['e1', 'e2', 'e3', 'e4', 'e5', 'e6']) - d6_values = d6_dict[np.random.choice(d6_dict.size, size, p=[0.166, 0.166, 0.166, 0.166, 0.166, 0.17])] - - c1_values = -4.2 + np.random.normal(0, 0.75, size=size) - c2_values = np.random.normal(1, 2, size=size) - c3_values = np.random.normal(2, 0.7, size=size) - c4_values = np.random.normal(-3, 2.5, size=size) - c5_values = np.random.normal(-1.2, 0.5, size=size) - c6_values = np.random.normal(3, 1.5, size=size) - - return pd.DataFrame({'D2': pd.Series(d2_values, dtype='category'), - 'D3': pd.Series(d3_values, dtype='category'), - 'D4': pd.Series(d4_values, dtype='category'), - 'D5': pd.Series(d5_values, dtype='category'), - 'D6': pd.Series(d6_values, dtype='category'), - 'C1': c1_values, - 'C2': c2_values, - 'C3': c3_values, - 'C4': c4_values, - 'C5': c5_values, - 'C6': c6_values, - }) \ No newline at end of file diff --git a/tests/learning/algorithms/constraint_test.py b/tests/learning/algorithms/constraint_test.py index 91fca663..56edf443 100644 --- a/tests/learning/algorithms/constraint_test.py +++ b/tests/learning/algorithms/constraint_test.py @@ -1,44 +1,58 @@ -from pybnesian import PartiallyDirectedGraph, MeekRules +import pybnesian as pbn + def test_meek_rule1(): # From Koller Chapter 3.4, Figure 3.12, pag 89. - gr1 = PartiallyDirectedGraph(["X", "Y", "Z"], [("X", "Y")], [("Y", "Z")]) + gr1 = pbn.PartiallyDirectedGraph(["X", "Y", "Z"], [("X", "Y")], [("Y", "Z")]) - assert MeekRules.rule1(gr1) + assert pbn.MeekRules.rule1(gr1) assert gr1.num_edges() == 0 - assert set(gr1.arcs()) == set([('X', 'Y'), ('Y', 'Z')]) + assert set(gr1.arcs()) == set([("X", "Y"), ("Y", "Z")]) + + assert not pbn.MeekRules.rule1(gr1) - assert not MeekRules.rule1(gr1) def test_meek_rule2(): # From Koller Chapter 3.4, Figure 3.12, pag 89. - gr2 = PartiallyDirectedGraph(["X", "Y", "Z"], [("X", "Y"), ("Y", "Z")], [("X", "Z")]) + gr2 = pbn.PartiallyDirectedGraph( + ["X", "Y", "Z"], [("X", "Y"), ("Y", "Z")], [("X", "Z")] + ) - assert MeekRules.rule2(gr2) + assert pbn.MeekRules.rule2(gr2) assert gr2.num_edges() == 0 - assert set(gr2.arcs()) == set([('X', 'Y'), ('Y', 'Z'), ('X', 'Z')]) - assert not MeekRules.rule2(gr2) + assert set(gr2.arcs()) == set([("X", "Y"), ("Y", "Z"), ("X", "Z")]) + assert not pbn.MeekRules.rule2(gr2) + def test_meek_rule3(): # From Koller Chapter 3.4, Figure 3.12, pag 89. - gr3 = PartiallyDirectedGraph(["X", "Y1", "Y2", "Z"], [("Y1", "Z"), ("Y2", "Z")], [("X", "Y1"), ("X", "Y2"), ("X", "Z")]) + gr3 = pbn.PartiallyDirectedGraph( + ["X", "Y1", "Y2", "Z"], + [("Y1", "Z"), ("Y2", "Z")], + [("X", "Y1"), ("X", "Y2"), ("X", "Z")], + ) + + assert pbn.MeekRules.rule3(gr3) + assert set(gr3.edges()) == set([("X", "Y1"), ("X", "Y2")]) + assert set(gr3.arcs()) == set([("X", "Z"), ("Y1", "Z"), ("Y2", "Z")]) + assert not pbn.MeekRules.rule3(gr3) - assert MeekRules.rule3(gr3) - assert set(gr3.edges()) == set([('X', 'Y1'), ('X', 'Y2')]) - assert set(gr3.arcs()) == set([('X', 'Z'), ('Y1', 'Z'), ('Y2', 'Z')]) - assert not MeekRules.rule3(gr3) def test_meek_sequential(): # From Koller Chapter 3.4, Figure 3.13, pag 90. 
- koller = PartiallyDirectedGraph(["A", "B", "C", "D", "E", "F", "G"], - [("B", "E"), ("C", "E")], - [("A", "B"), ("B", "D"), ("C", "F"), ("E", "F"), ("F", "G")]) + koller = pbn.PartiallyDirectedGraph( + ["A", "B", "C", "D", "E", "F", "G"], + [("B", "E"), ("C", "E")], + [("A", "B"), ("B", "D"), ("C", "F"), ("E", "F"), ("F", "G")], + ) changed = True while changed: changed = False - changed = changed or MeekRules.rule1(koller) - changed = changed or MeekRules.rule2(koller) - changed = changed or MeekRules.rule3(koller) - - assert set(koller.edges()) == set([('A', 'B'), ('B', 'D')]) - assert set(koller.arcs()) == set([('B', 'E'), ('C', 'E'), ('E', 'F'), ('C', 'F'), ('F', 'G')]) \ No newline at end of file + changed = changed or pbn.MeekRules.rule1(koller) + changed = changed or pbn.MeekRules.rule2(koller) + changed = changed or pbn.MeekRules.rule3(koller) + + assert set(koller.edges()) == set([("A", "B"), ("B", "D")]) + assert set(koller.arcs()) == set( + [("B", "E"), ("C", "E"), ("E", "F"), ("C", "F"), ("F", "G")] + ) diff --git a/tests/learning/algorithms/hillclimbing_test.py b/tests/learning/algorithms/hillclimbing_test.py index 4b9ca490..902adfad 100644 --- a/tests/learning/algorithms/hillclimbing_test.py +++ b/tests/learning/algorithms/hillclimbing_test.py @@ -1,9 +1,53 @@ import numpy as np import pybnesian as pbn -from pybnesian import BayesianNetworkType, BayesianNetwork -import util_test +from helpers.data import generate_normal_data + +df = generate_normal_data(1000) + + +# TODO: Add tests for normal data with dependencies +# dep_df = generate_normal_data_dep(1000) + + +class MyRestrictedGaussianNetworkType(pbn.BayesianNetworkType): + def __init__(self): + pbn.BayesianNetworkType.__init__(self) + + def is_homogeneous(self): + return True + + def default_node_type(self): + return pbn.LinearGaussianCPDType() + + def can_have_arc(self, model, source, target): + return "A" in source + + def new_bn(self, nodes): + return NewBN(nodes) + + def __str__(self): + return "MyRestrictedGaussianNetworkType" + + +class NewBN(pbn.BayesianNetwork): + def __init__(self, variables, arcs=None): + if arcs is None: + pbn.BayesianNetwork.__init__( + self, MyRestrictedGaussianNetworkType(), variables + ) + else: + pbn.BayesianNetwork.__init__( + self, MyRestrictedGaussianNetworkType(), variables, arcs + ) + + self.extra_data = "extra" + + def __getstate_extra__(self): + return self.extra_data + + def __setstate_extra__(self, extra): + self.extra_data = extra -df = util_test.generate_normal_data(1000) def test_hc_estimate(): bic = pbn.BIC(df) @@ -11,11 +55,11 @@ def test_hc_estimate(): start = pbn.GaussianNetwork(column_names) # Check algorithm with BN with nodes removed. 
- column_names.insert(1, 'e') - column_names.insert(3, 'f') + column_names.insert(1, "E") + column_names.insert(3, "F") start_removed_nodes = pbn.GaussianNetwork(column_names) - start_removed_nodes.remove_node('e') - start_removed_nodes.remove_node('f') + start_removed_nodes.remove_node("E") + start_removed_nodes.remove_node("F") arc_set = pbn.ArcOperatorSet() @@ -38,39 +82,56 @@ def test_hc_estimate(): reversed_arc = res.arcs()[0][::-1] assert added_arc == reversed_arc - res_removed = hc.estimate(arc_set, bic, start_removed_nodes, max_iters=1, arc_blacklist=[added_arc_removed]) + res_removed = hc.estimate( + arc_set, + bic, + start_removed_nodes, + max_iters=1, + arc_blacklist=[added_arc_removed], + ) assert res.num_arcs() == 1 reversed_arc_removed = res_removed.arcs()[0][::-1] assert added_arc_removed == reversed_arc_removed - assert np.isclose(op_delta, bic.local_score(res, added_arc[1], [added_arc[0]]) - bic.local_score(res, added_arc[1], [])) - assert np.isclose(op_delta, bic.local_score(res, added_arc_removed[1], [added_arc_removed[0]]) - bic.local_score(res, added_arc_removed[1], [])) + assert np.isclose( + op_delta, + bic.local_score(res, added_arc[1], [added_arc[0]]) + - bic.local_score(res, added_arc[1], []), + ) + assert np.isclose( + op_delta, + bic.local_score(res, added_arc_removed[1], [added_arc_removed[0]]) + - bic.local_score(res, added_arc_removed[1], []), + ) res = hc.estimate(arc_set, bic, start, epsilon=(op_delta + 0.01)) assert res.num_arcs() == start.num_arcs() - res_removed = hc.estimate(arc_set, bic, start_removed_nodes, epsilon=(op_delta + 0.01)) + res_removed = hc.estimate( + arc_set, bic, start_removed_nodes, epsilon=(op_delta + 0.01) + ) assert res_removed.num_arcs() == start_removed_nodes.num_arcs() - # Can't compare models because the arcs could be oriented in different directions, + # Can't compare models because the arcs could be oriented in different directions, # leading to a different search path. Execute the code, just to check no error is given.
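# (Editorial sketch, not part of the original test; uncomment to run.) When
# orientations may differ, a weaker, direction-independent comparison is still
# possible: compare the skeletons, i.e. the arc sets with direction dropped.
#
# def skeleton(model):
#     # Undirected view of the arcs: {frozenset({source, target}), ...}
#     return {frozenset(arc) for arc in model.arcs()}
#
# # Not asserted here: different orientations can change the whole search path,
# # so even the skeletons of `res` and `res_removed` may legitimately differ.
# same_skeleton = skeleton(res) == skeleton(res_removed)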
res = hc.estimate(arc_set, bic, start, verbose=False) res_removed = hc.estimate(arc_set, bic, start_removed_nodes, verbose=False) + def test_hc_conditional_estimate(): bic = pbn.BIC(df) column_names = list(df.columns.values) start = pbn.ConditionalGaussianNetwork(column_names[2:], column_names[:2]) - + nodes = column_names[2:] - nodes.insert(1, 'e') + nodes.insert(1, "E") interface_nodes = column_names[:2] - interface_nodes.insert(1, 'f') + interface_nodes.insert(1, "F") start_removed_nodes = pbn.ConditionalGaussianNetwork(nodes, interface_nodes) - start_removed_nodes.remove_node('e') - start_removed_nodes.remove_interface_node('f') - + start_removed_nodes.remove_node("E") + start_removed_nodes.remove_interface_node("F") + arc_set = pbn.ArcOperatorSet() hc = pbn.GreedyHillClimbing() @@ -79,37 +140,50 @@ def test_hc_conditional_estimate(): added_arc = res.arcs()[0] op_delta = bic.score(res) - bic.score(start) - res_removed = hc.estimate(arc_set, bic, start_removed_nodes, max_iters=1, verbose=False) + res_removed = hc.estimate( + arc_set, bic, start_removed_nodes, max_iters=1, verbose=False + ) assert res_removed.num_arcs() == 1 added_arc_removed = res_removed.arcs()[0] assert added_arc == added_arc_removed or added_arc == added_arc_removed[::-1] assert np.isclose(op_delta, bic.score(res_removed) - bic.score(start_removed_nodes)) - assert np.isclose(op_delta, bic.local_score(res, added_arc[1], [added_arc[0]]) - - bic.local_score(res, added_arc[1], [])) - assert np.isclose(op_delta, bic.local_score(res, added_arc_removed[1], [added_arc_removed[0]]) - - bic.local_score(res, added_arc_removed[1], [])) + assert np.isclose( + op_delta, + bic.local_score(res, added_arc[1], [added_arc[0]]) + - bic.local_score(res, added_arc[1], []), + ) + assert np.isclose( + op_delta, + bic.local_score(res, added_arc_removed[1], [added_arc_removed[0]]) + - bic.local_score(res, added_arc_removed[1], []), + ) res = hc.estimate(arc_set, bic, start, epsilon=(op_delta + 0.01)) assert res.num_arcs() == start.num_arcs() - res_removed = hc.estimate(arc_set, bic, start_removed_nodes, epsilon=(op_delta + 0.01)) + res_removed = hc.estimate( + arc_set, bic, start_removed_nodes, epsilon=(op_delta + 0.01) + ) assert res_removed.num_arcs() == start_removed_nodes.num_arcs() res = hc.estimate(arc_set, bic, start, verbose=False) - assert all(map(lambda arc : not res.is_interface(arc[1]), res.arcs())) + assert all(map(lambda arc: not res.is_interface(arc[1]), res.arcs())) res_removed = hc.estimate(arc_set, bic, start_removed_nodes, verbose=False) - assert all(map(lambda arc : not res_removed.is_interface(arc[1]), res_removed.arcs())) + assert all( + map(lambda arc: not res_removed.is_interface(arc[1]), res_removed.arcs()) + ) + def test_hc_estimate_validation(): column_names = list(df.columns.values) start = pbn.GaussianNetwork(column_names) - column_names.insert(1, 'e') - column_names.insert(4, 'f') + column_names.insert(1, "E") + column_names.insert(4, "F") start_removed_nodes = pbn.GaussianNetwork(column_names) - start_removed_nodes.remove_node('e') - start_removed_nodes.remove_node('f') - + start_removed_nodes.remove_node("E") + start_removed_nodes.remove_node("F") + vl = pbn.ValidatedLikelihood(df) arc_set = pbn.ArcOperatorSet() @@ -124,12 +198,20 @@ def test_hc_estimate_validation(): assert res_removed.num_arcs() == 1 added_arc_removed = res_removed.arcs()[0] assert added_arc == added_arc_removed or added_arc == added_arc_removed[::-1] - assert np.isclose(op_delta, vl.cv_lik.score(res_removed) - 
vl.cv_lik.score(start_removed_nodes)) - - assert np.isclose(op_delta, vl.cv_lik.local_score(res, added_arc[1], [added_arc[0]]) - - vl.cv_lik.local_score(res, added_arc[1], [])) - assert np.isclose(op_delta, vl.cv_lik.local_score(res, added_arc_removed[1], [added_arc_removed[0]]) - - vl.cv_lik.local_score(res, added_arc_removed[1], [])) + assert np.isclose( + op_delta, vl.cv_lik.score(res_removed) - vl.cv_lik.score(start_removed_nodes) + ) + + assert np.isclose( + op_delta, + vl.cv_lik.local_score(res, added_arc[1], [added_arc[0]]) + - vl.cv_lik.local_score(res, added_arc[1], []), + ) + assert np.isclose( + op_delta, + vl.cv_lik.local_score(res, added_arc_removed[1], [added_arc_removed[0]]) + - vl.cv_lik.local_score(res, added_arc_removed[1], []), + ) # CV is score equivalent for GBNs, so if we blacklist the added_edge, its reverse will be added. res = hc.estimate(arc_set, vl, start, max_iters=1, arc_blacklist=[added_arc]) @@ -137,65 +219,39 @@ assert res.num_arcs() == 1 reversed_arc = res.arcs()[0][::-1] assert added_arc == reversed_arc - res_removed = hc.estimate(arc_set, vl, start_removed_nodes, max_iters=1, arc_blacklist=[added_arc_removed]) + res_removed = hc.estimate( + arc_set, vl, start_removed_nodes, max_iters=1, arc_blacklist=[added_arc_removed] + ) assert res_removed.num_arcs() == 1 reversed_arc_removed = res_removed.arcs()[0][::-1] assert reversed_arc == reversed_arc_removed - + res = hc.estimate(arc_set, vl, start, epsilon=(op_delta + 0.01)) assert res.num_arcs() == start.num_arcs() - res_removed = hc.estimate(arc_set, vl, start_removed_nodes, epsilon=(op_delta + 0.01)) + res_removed = hc.estimate( + arc_set, vl, start_removed_nodes, epsilon=(op_delta + 0.01) + ) assert res_removed.num_arcs() == start_removed_nodes.num_arcs() - # Can't compare models because the arcs could be oriented in different directions, + # Can't compare models because the arcs could be oriented in different directions, # leading to a different search path. Execute the code, just to check no error is given.
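# (Editorial sketch, assuming the module-level `df` fixture; uncomment to run.)
# The "CV is score equivalent" comment earlier in this test is the property the
# blacklist behaviour relies on: both orientations of a single arc receive the
# same cross-validated score, so once the best arc is blacklisted, hill
# climbing adds its reversal instead. A direct check:
#
# vl_check = pbn.ValidatedLikelihood(df)
# forward = pbn.GaussianNetwork(["A", "B", "C", "D"], [("A", "B")])
# backward = pbn.GaussianNetwork(["A", "B", "C", "D"], [("B", "A")])
# assert np.isclose(vl_check.cv_lik.score(forward), vl_check.cv_lik.score(backward))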
res = hc.estimate(arc_set, vl, start, verbose=False) res_removed = hc.estimate(arc_set, vl, start_removed_nodes, verbose=False) + def test_hc_shortcut_function(): model = pbn.hc(df, bn_type=pbn.GaussianNetworkType()) assert type(model) == pbn.GaussianNetwork - model = pbn.hc(df, bn_type=MyRestrictedGaussianNetworkType(), score="bic", operators=["arcs"]) + model = pbn.hc( + df, bn_type=MyRestrictedGaussianNetworkType(), score="bic", operators=["arcs"] + ) assert type(model) == NewBN -class MyRestrictedGaussianNetworkType(BayesianNetworkType): - def __init__(self): - BayesianNetworkType.__init__(self) - - def is_homogeneous(self): - return True - - def default_node_type(self): - return pbn.LinearGaussianCPDType() - - def can_have_arc(self, model, source, target): - return "a" in source - - def new_bn(self, nodes): - return NewBN(nodes) - - def __str__(self): - return "MyRestrictedGaussianNetworkType" - -class NewBN(BayesianNetwork): - def __init__(self, variables, arcs=None): - if arcs is None: - BayesianNetwork.__init__(self, MyRestrictedGaussianNetworkType(), variables) - else: - BayesianNetwork.__init__(self, MyRestrictedGaussianNetworkType(), variables, arcs) - - self.extra_data = "extra" - - def __getstate_extra__(self): - return self.extra_data - - def __setstate_extra__(self, extra): - self.extra_data = extra def test_newbn_estimate_validation(): - start = NewBN(["a", "b", "c", "d"]) + start = NewBN(["A", "B", "C", "D"]) hc = pbn.GreedyHillClimbing() arc = pbn.ArcOperatorSet() bic = pbn.BIC(df) @@ -203,4 +259,52 @@ def test_newbn_estimate_validation(): estimated = hc.estimate(arc, bic, start) assert type(start) == type(estimated) - assert estimated.extra_data == "extra" \ No newline at end of file + assert estimated.extra_data == "extra" + + +# TODO: Test for when one variable has 0 variance in k-fold cross-validation for CKDEType +# # NOTE: Deprecated test for PyBNesian with full covariance matrices +# def test_hc_arc_singular_covariance(): +# """Function to test if with the GBN, KDE and SPBN, the HC algorithm raises an exception when the covariance matrix is singular. 
We then check that the learnt model is valid.""" +# column_names = list(dep_df.columns.values) +# # GBN +# gbn = pbn.GaussianNetwork(nodes=column_names) +# gbn = pbn.hc( +# dep_df, +# start=gbn, +# max_iters=int(1e4), +# verbose=True, +# ) +# gbn.fit(dep_df) +# assert gbn.num_arcs() == 0 +# assert np.count_nonzero(np.isnan(gbn.logl(dep_df))) == 0 +# for c in column_names: +# print(f"{gbn.cpd(c)}") + +# # KDE +# kde = pbn.KDENetwork(nodes=column_names) +# kde = pbn.hc( +# dep_df, +# start=kde, +# max_iters=int(1e4), +# verbose=True, +# ) +# kde.fit(dep_df) +# assert kde.num_arcs() == 0 +# assert np.count_nonzero(np.isnan(kde.logl(dep_df))) == 0 +# for c in column_names: +# print(f"{kde.cpd(c)}") + +# # SPBN +# spbn = pbn.SemiparametricBN(nodes=column_names) +# spbn = pbn.hc( +# dep_df, +# start=spbn, +# max_iters=int(1e4), +# verbose=True, +# ) +# spbn.fit(dep_df) +# assert spbn.num_arcs() == 0 +# assert np.count_nonzero(np.isnan(spbn.logl(dep_df))) == 0 +# for c in column_names: +# print(f"{spbn.cpd(c)}") diff --git a/tests/learning/independence_tests/independence_test.py b/tests/learning/independence_tests/independence_test.py new file mode 100644 index 00000000..185973bf --- /dev/null +++ b/tests/learning/independence_tests/independence_test.py @@ -0,0 +1,155 @@ +import itertools + +import numpy as np +import pandas as pd +import pybnesian as pbn +from helpers.data import ( + DATA_SIZE, + N_NEIGHBORS, + SEED, + generate_discrete_data, + generate_discrete_data_independent, + generate_normal_data, + generate_normal_data_independent, +) +from scipy.stats import pearsonr + +# from sklearn.feature_selection import mutual_info_regression + +data = generate_normal_data(DATA_SIZE, SEED) +independent_data = generate_normal_data_independent(DATA_SIZE, SEED) + +discrete_data = generate_discrete_data(DATA_SIZE, SEED) +independent_discrete_data = generate_discrete_data_independent(DATA_SIZE, SEED) + +# INDEPENDENCE TESTS +# The null hypothesis (H0) is that the two variables are independent, +# while the alternative hypothesis (H1) is that the two variables are dependent. +# +# - If the p-value is less than or equal to the chosen significance level (usually 0.05), +# you reject the null hypothesis (H0) in favor of the alternative hypothesis (H1). +# This suggests that there is a statistically significant association between the two variables. +# +# - If the p-value is greater than the significance level, you do not reject the null hypothesis.
+# This indicates that there is insufficient evidence to conclude that the variables are dependent, +# and it is plausible that they are independent. + + +def test_chi_square(): + """Test the chi-square independence test with discrete data""" + chi_square = pbn.ChiSquare(discrete_data) + independent_chi_square = pbn.ChiSquare(independent_discrete_data) + + p_value = chi_square.pvalue("A", "B") + independent_p_value = independent_chi_square.pvalue("A", "B") + + # Check that the p-value is below the significance level for dependent data and above it for independent data + assert p_value < 0.05 + assert independent_p_value > 0.05 + + +# RFE: Test true and false independence +def test_linear_correlation(): + """Test the linear correlation independence test with normal data""" + df = data[["A", "B"]] + independent_df = independent_data[["A", "B"]] + + # Pybnesian Linear correlation + linear_correlation = pbn.LinearCorrelation(df) + independent_linear_correlation = pbn.LinearCorrelation(independent_df) + pvalue = linear_correlation.pvalue("A", "B") + independent_pvalue = independent_linear_correlation.pvalue("A", "B") + + # scipy pearsonr correlation + correlations = {} + columns = df.columns.tolist() + for col_a, col_b in itertools.combinations(columns, 2): + correlations[col_a + "__" + col_b] = pearsonr( + df.loc[:, col_a], df.loc[:, col_b] + ) + result = pd.DataFrame.from_dict( + correlations, orient="index", columns=["PCC", "p-value"] + ) + + # Compare correlation values + np.testing.assert_allclose( + np.array([df.corr().loc["A", "B"]]), + np.array([result.loc["A__B", "PCC"]]), + rtol=1e-5, + atol=1e-8, + ) + # Compare p-values + np.testing.assert_allclose( + np.array([pvalue]), + np.array([result.loc["A__B", "p-value"]]), + rtol=1e-5, + atol=1e-8, + ) + + # Check that the p-value is below the significance level for dependent data and above it for independent data + assert pvalue < 0.05 + assert independent_pvalue > 0.05 + + +def test_mutual_info(): + """Test the mutual information independence test with normal data""" + mutual_info = pbn.MutualInformation(data) + independent_mutual_info = pbn.MutualInformation(independent_data) + + # Check whether the mutual information is higher when the variables are dependent + mutual_info_value = mutual_info.mi("A", "B") + independent_mutual_info_value = independent_mutual_info.mi("A", "B") + assert mutual_info_value > independent_mutual_info_value + + # Check that the p-value is below the significance level for dependent data and above it for independent data + pvalue = mutual_info.pvalue("A", "B") + independent_pvalue = independent_mutual_info.pvalue("A", "B") + assert pvalue < 0.05 + assert independent_pvalue > 0.05 + + +def test_k_mutual_info(): + """Test the k-nearest neighbors mutual information independence test with normal data""" + k_mutual_info = pbn.KMutualInformation(data, k=N_NEIGHBORS) + independent_k_mutual_info = pbn.KMutualInformation(independent_data, k=N_NEIGHBORS) + + # Check whether the mutual information is higher when the variables are dependent + k_mutual_info_value = k_mutual_info.mi("A", "B") + independent_k_mutual_info_value = independent_k_mutual_info.mi("A", "B") + assert k_mutual_info_value > independent_k_mutual_info_value + + # Check that the p-value is below the significance level for dependent data and above it for independent data + # NOTE: Slow execution + pvalue = k_mutual_info.pvalue("A", "B") + independent_pvalue = independent_k_mutual_info.pvalue("A", "B") + assert pvalue < 0.05 + assert independent_pvalue > 0.05 + + # RFE: Results vary with scikit-learn, why?
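# (Editorial note, one plausible answer to the RFE above; uses names already
# defined in this test.) scikit-learn's mutual_info_regression adds a small
# random noise to continuous features before applying the
# Kraskov-Stogbauer-Grassberger estimator, so exact agreement is not expected.
# For jointly Gaussian variables there is an estimator-independent reference,
# the closed form MI = -0.5 * log(1 - rho**2) (in nats):
#
# rho = data["A"].corr(data["B"])
# analytic_mi = -0.5 * np.log(1 - rho**2)
# # As DATA_SIZE grows, k_mutual_info_value should approach analytic_mi,
# # assuming the estimator reports mutual information in nats.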
+ + # sklearn_k_mutual_info_value = mutual_info_regression( + # data[["A"]], data["B"], n_neighbors=N_NEIGHBORS + # )[0] + # print(k_mutual_info_value) + # print("\n", sklearn_k_mutual_info_value) + # np.testing.assert_allclose( + # sklearn_k_mutual_info_value, + # np.array([k_mutual_info_value]), + # rtol=1e-5, + # atol=1e-8, + # ) + # RFE: Test alternative https://github.com/syanga/pycit + + +def test_rcot(): + """Test the Randomized Conditional Correlation Test (pbn.RCoT) independence test with normal data""" + rcot = pbn.RCoT(data, random_fourier_xy=5, random_fourier_z=100) + independent_rcot = pbn.RCoT( + independent_data, random_fourier_xy=5, random_fourier_z=100 + ) + p_value = rcot.pvalue("A", "B") + independent_p_value = independent_rcot.pvalue("A", "B") + + # Check that the p-value is below the significance level for dependent data and above it for independent data + assert p_value < 0.05 + assert independent_p_value > 0.05 diff --git a/tests/learning/operators/operatorpool_test.py b/tests/learning/operators/operatorpool_test.py index 60f05f71..c2184689 100644 --- a/tests/learning/operators/operatorpool_test.py +++ b/tests/learning/operators/operatorpool_test.py @@ -1,29 +1,32 @@ -import pytest import pybnesian as pbn -import util_test +import pytest +from helpers.data import DATA_SIZE, generate_normal_data + +df = generate_normal_data(DATA_SIZE) -SIZE = 10000 -df = util_test.generate_normal_data(SIZE) def test_create(): arcs = pbn.ArcOperatorSet() node_type = pbn.ChangeNodeTypeSet() pool = pbn.OperatorPool([arcs, node_type]) + # Checks if pool is created + assert pool is not None with pytest.raises(ValueError) as ex: - pool = pbn.OperatorPool([]) + pbn.OperatorPool([]) assert "cannot be empty" in str(ex.value) + def test_find_max(): - spbn = pbn.SemiparametricBN(['a', 'b', 'c', 'd']) + spbn = pbn.SemiparametricBN(["A", "B", "C", "D"]) cv = pbn.CVLikelihood(df) arcs = pbn.ArcOperatorSet() node_type = pbn.ChangeNodeTypeSet() - + arcs.cache_scores(spbn, cv) spbn.set_unknown_node_types(df) node_type.cache_scores(spbn, cv) - + arcs_max = arcs.find_max(spbn) node_max = node_type.find_max(spbn) @@ -36,5 +39,3 @@ def test_find_max(): assert op_combined == arcs_max else: assert op_combined == node_max - - \ No newline at end of file diff --git a/tests/learning/operators/operators_test.py b/tests/learning/operators/operators_test.py index f0c7cf5e..cc634e48 100644 --- a/tests/learning/operators/operators_test.py +++ b/tests/learning/operators/operators_test.py @@ -1,105 +1,108 @@ -import pytest import pybnesian as pbn +import pytest + def test_create(): - o = pbn.AddArc("a", "b", 1) - assert o.source() == 'a' - assert o.target() == 'b' + o = pbn.AddArc("A", "B", 1) + assert o.source() == "A" + assert o.target() == "B" assert o.delta() == 1 - o = pbn.RemoveArc("a", "b", 2) - assert o.source() == 'a' - assert o.target() == 'b' + o = pbn.RemoveArc("A", "B", 2) + assert o.source() == "A" + assert o.target() == "B" assert o.delta() == 2 - o = pbn.FlipArc("a", "b", 3) - assert o.source() == 'a' - assert o.target() == 'b' + o = pbn.FlipArc("A", "B", 3) + assert o.source() == "A" + assert o.target() == "B" assert o.delta() == 3 - o = pbn.ChangeNodeType("a", pbn.CKDEType(), 4) - assert o.node() == 'a' + o = pbn.ChangeNodeType("A", pbn.CKDEType(), 4) + assert o.node() == "A" assert o.node_type() == pbn.CKDEType() assert o.delta() == 4 + def test_apply(): - gbn = pbn.GaussianNetwork(['a', 'b', 'c', 'd']) + gbn = pbn.GaussianNetwork(["A", "B", "C", "D"]) assert gbn.num_arcs() == 0 - assert not gbn.has_arc('a', 'b') + assert not gbn.has_arc("A",
"B") - o = pbn.AddArc("a", "b", 1) + o = pbn.AddArc("A", "B", 1) o.apply(gbn) assert gbn.num_arcs() == 1 - assert gbn.has_arc('a', 'b') - - o = pbn.FlipArc("a", "b", 1) + assert gbn.has_arc("A", "B") + + o = pbn.FlipArc("A", "B", 1) o.apply(gbn) assert gbn.num_arcs() == 1 - assert not gbn.has_arc('a', 'b') - assert gbn.has_arc('b', 'a') + assert not gbn.has_arc("A", "B") + assert gbn.has_arc("B", "A") - o = pbn.RemoveArc("b", "a", 1) + o = pbn.RemoveArc("B", "A", 1) o.apply(gbn) assert gbn.num_arcs() == 0 - assert not gbn.has_arc('b', 'a') + assert not gbn.has_arc("B", "A") - o = pbn.ChangeNodeType("a", pbn.CKDEType(), 1) + o = pbn.ChangeNodeType("A", pbn.CKDEType(), 1) with pytest.raises(ValueError) as ex: o.apply(gbn) assert "Wrong factor type" in str(ex.value) - spbn = pbn.SemiparametricBN(['a', 'b', 'c', 'd']) + spbn = pbn.SemiparametricBN(["A", "B", "C", "D"]) assert spbn.num_arcs() == 0 - o = pbn.ChangeNodeType("a", pbn.CKDEType(), 1) - assert(spbn.node_type('a') == pbn.UnknownFactorType()) + o = pbn.ChangeNodeType("A", pbn.CKDEType(), 1) + assert spbn.node_type("A") == pbn.UnknownFactorType() o.apply(spbn) - assert(spbn.node_type('a') == pbn.CKDEType()) + assert spbn.node_type("A") == pbn.CKDEType() - assert not spbn.has_arc('a', 'b') - o = pbn.AddArc("a", "b", 1) + assert not spbn.has_arc("A", "B") + o = pbn.AddArc("A", "B", 1) o.apply(spbn) assert spbn.num_arcs() == 1 - assert spbn.has_arc('a', 'b') - - o = pbn.FlipArc("a", "b", 1) + assert spbn.has_arc("A", "B") + + o = pbn.FlipArc("A", "B", 1) o.apply(spbn) assert spbn.num_arcs() == 1 - assert not spbn.has_arc('a', 'b') - assert spbn.has_arc('b', 'a') + assert not spbn.has_arc("A", "B") + assert spbn.has_arc("B", "A") - o = pbn.RemoveArc("b", "a", 1) + o = pbn.RemoveArc("B", "A", 1) o.apply(spbn) assert spbn.num_arcs() == 0 - assert not spbn.has_arc('b', 'a') + assert not spbn.has_arc("B", "A") + def test_opposite(): - bn = pbn.SemiparametricBN(["a", "b"]) - o = pbn.AddArc("a", "b", 1) + bn = pbn.SemiparametricBN(["A", "B"]) + o = pbn.AddArc("A", "B", 1) oppo = o.opposite(bn) - assert oppo.source() == 'a' - assert oppo.target() == 'b' + assert oppo.source() == "A" + assert oppo.target() == "B" assert oppo.delta() == -1 assert type(oppo) == pbn.RemoveArc - o = pbn.RemoveArc("a", "b", 1) + o = pbn.RemoveArc("A", "B", 1) oppo = o.opposite(bn) - assert oppo.source() == 'a' - assert oppo.target() == 'b' + assert oppo.source() == "A" + assert oppo.target() == "B" assert oppo.delta() == -1 assert type(oppo) == pbn.AddArc - o = pbn.FlipArc("a", "b", 1) + o = pbn.FlipArc("A", "B", 1) oppo = o.opposite(bn) - assert oppo.source() == 'b' - assert oppo.target() == 'a' + assert oppo.source() == "B" + assert oppo.target() == "A" assert oppo.delta() == -1 assert type(oppo) == pbn.FlipArc - bn.set_node_type("a", pbn.LinearGaussianCPDType()) - o = pbn.ChangeNodeType("a", pbn.CKDEType(), 1) + bn.set_node_type("A", pbn.LinearGaussianCPDType()) + o = pbn.ChangeNodeType("A", pbn.CKDEType(), 1) oppo = o.opposite(bn) - assert oppo.node() == 'a' + assert oppo.node() == "A" assert oppo.node_type() == pbn.LinearGaussianCPDType() assert oppo.delta() == -1 - assert type(oppo) == pbn.ChangeNodeType \ No newline at end of file + assert type(oppo) == pbn.ChangeNodeType diff --git a/tests/learning/operators/operatorset_test.py b/tests/learning/operators/operatorset_test.py index a29a3af2..5deb682a 100644 --- a/tests/learning/operators/operatorset_test.py +++ b/tests/learning/operators/operatorset_test.py @@ -1,14 +1,14 @@ -import pytest import numpy as np 
import pybnesian as pbn -import util_test +import pytest +from helpers.data import DATA_SIZE, generate_normal_data + +df = generate_normal_data(DATA_SIZE) -SIZE = 10000 -df = util_test.generate_normal_data(SIZE) def test_create_change_node(): - gbn = pbn.GaussianNetwork(['a', 'b', 'c', 'd']) - + gbn = pbn.GaussianNetwork(["A", "B", "C", "D"]) + cv = pbn.CVLikelihood(df) node_op = pbn.ChangeNodeTypeSet() @@ -17,25 +17,26 @@ def test_create_change_node(): node_op.cache_scores(gbn, cv) assert "can only be used with non-homogeneous" in str(ex.value) + def test_lists(): - gbn = pbn.GaussianNetwork(['a', 'b', 'c', 'd']) + gbn = pbn.GaussianNetwork(["A", "B", "C", "D"]) bic = pbn.BIC(df) arc_op = pbn.ArcOperatorSet() - arc_op.set_arc_blacklist([("b", "a")]) - arc_op.set_arc_whitelist([("b", "c")]) + arc_op.set_arc_blacklist([("B", "A")]) + arc_op.set_arc_whitelist([("B", "C")]) arc_op.set_max_indegree(3) - arc_op.set_type_whitelist([("a", pbn.LinearGaussianCPDType())]) + arc_op.set_type_whitelist([("A", pbn.LinearGaussianCPDType())]) arc_op.cache_scores(gbn, bic) - arc_op.set_arc_blacklist([("e", "a")]) + arc_op.set_arc_blacklist([("E", "A")]) with pytest.raises(ValueError) as ex: arc_op.cache_scores(gbn, bic) assert "not present in the graph" in str(ex.value) - arc_op.set_arc_whitelist([("e", "a")]) + arc_op.set_arc_whitelist([("E", "A")]) with pytest.raises(ValueError) as ex: arc_op.cache_scores(gbn, bic) @@ -43,7 +44,7 @@ def test_lists(): def test_check_max_score(): - gbn = pbn.GaussianNetwork(['c', 'd']) + gbn = pbn.GaussianNetwork(["C", "D"]) bic = pbn.BIC(df) arc_op = pbn.ArcOperatorSet() @@ -51,28 +52,28 @@ def test_check_max_score(): arc_op.cache_scores(gbn, bic) op = arc_op.find_max(gbn) - assert np.isclose(op.delta(), (bic.local_score(gbn, 'd', ['c']) - bic.local_score(gbn, 'd'))) + assert np.isclose( + op.delta(), (bic.local_score(gbn, "D", ["C"]) - bic.local_score(gbn, "D")) + ) # BIC is decomposable so the best operation is the arc in reverse direction. 
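# (Editorial sketch; uncomment to run.) Strictly, the reversed arc is the
# next-best operator because BIC is score equivalent for Gaussian networks:
# adding C -> D or D -> C changes the total score by the same amount.
# Decomposability then reduces each delta to a single local-score difference:
#
# delta_cd = bic.local_score(gbn, "D", ["C"]) - bic.local_score(gbn, "D", [])
# delta_dc = bic.local_score(gbn, "C", ["D"]) - bic.local_score(gbn, "C", [])
# assert np.isclose(delta_cd, delta_dc)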
arc_op.set_arc_blacklist([(op.source(), op.target())]) arc_op.cache_scores(gbn, bic) - + op2 = arc_op.find_max(gbn) assert op.source() == op2.target() assert op.target() == op2.source() assert (type(op) == type(op2)) and (type(op) == pbn.AddArc) + def test_nomax(): - gbn = pbn.GaussianNetwork(['a', 'b']) + gbn = pbn.GaussianNetwork(["A", "B"]) bic = pbn.BIC(df) - arc_op = pbn.ArcOperatorSet(whitelist=[("a", "b")]) + arc_op = pbn.ArcOperatorSet(whitelist=[("A", "B")]) arc_op.cache_scores(gbn, bic) op = arc_op.find_max(gbn) assert op is None - - - diff --git a/tests/learning/operators/operatorstabuset_test.py b/tests/learning/operators/operatorstabuset_test.py index eae3ade1..b91daca1 100644 --- a/tests/learning/operators/operatorstabuset_test.py +++ b/tests/learning/operators/operatorstabuset_test.py @@ -1,24 +1,23 @@ import pybnesian as pbn + def test_OperatorTabuSet(): tabu_set = pbn.OperatorTabuSet() assert tabu_set.empty() - assert not tabu_set.contains(pbn.AddArc("a", "b", 1)) - tabu_set.insert(pbn.AddArc("a", "b", 2)) + assert not tabu_set.contains(pbn.AddArc("A", "B", 1)) + tabu_set.insert(pbn.AddArc("A", "B", 2)) assert not tabu_set.empty() - assert tabu_set.contains(pbn.AddArc("a", "b", 3)) + assert tabu_set.contains(pbn.AddArc("A", "B", 3)) - assert not tabu_set.contains(pbn.RemoveArc("b", "c", 4)) - tabu_set.insert(pbn.RemoveArc("b", "c", 5)) - assert tabu_set.contains(pbn.RemoveArc("b", "c", 6)) + assert not tabu_set.contains(pbn.RemoveArc("B", "C", 4)) + tabu_set.insert(pbn.RemoveArc("B", "C", 5)) + assert tabu_set.contains(pbn.RemoveArc("B", "C", 6)) - assert not tabu_set.contains(pbn.FlipArc("c", "d", 7)) - tabu_set.insert(pbn.RemoveArc("c", "d", 8)) - assert tabu_set.contains(pbn.RemoveArc("c", "d", 9)) + assert not tabu_set.contains(pbn.FlipArc("C", "D", 7)) + tabu_set.insert(pbn.RemoveArc("C", "D", 8)) + assert tabu_set.contains(pbn.RemoveArc("C", "D", 9)) tabu_set.clear() assert tabu_set.empty() - - diff --git a/tests/learning/parameters/mle_test.py b/tests/learning/parameters/mle_test.py index 37699de6..b676e813 100644 --- a/tests/learning/parameters/mle_test.py +++ b/tests/learning/parameters/mle_test.py @@ -1,10 +1,9 @@ -import pytest import numpy as np import pybnesian as pbn -import util_test +import pytest +from helpers.data import DATA_SIZE, generate_normal_data -SIZE = 10000 -df = util_test.generate_normal_data(SIZE) +df = generate_normal_data(DATA_SIZE) def numpy_fit_mle_lg(data, variable, evidence): @@ -20,37 +19,42 @@ def numpy_fit_mle_lg(data, variable, evidence): N = variable_data.shape[0] d = evidence_data.shape[1] linregress_data = np.column_stack((np.ones(N), evidence_data.to_numpy())) - (beta, res, _, _) = np.linalg.lstsq(linregress_data, variable_data.to_numpy(), rcond=None) + (beta, res, _, _) = np.linalg.lstsq( + linregress_data, variable_data.to_numpy(), rcond=None + ) var = res / (N - d - 1) return beta, var + def test_mle_create(): with pytest.raises(ValueError) as ex: - mle = pbn.MLE(pbn.CKDEType()) + pbn.MLE(pbn.CKDEType()) assert "MLE not available" in str(ex.value) mle = pbn.MLE(pbn.LinearGaussianCPDType()) + assert mle is not None + def test_mle_lg(): mle = pbn.MLE(pbn.LinearGaussianCPDType()) - p = mle.estimate(df, "a", []) - np_beta, np_var = numpy_fit_mle_lg(df, "a", []) + p = mle.estimate(df, "A", []) + np_beta, np_var = numpy_fit_mle_lg(df, "A", []) assert np.all(np.isclose(p.beta, np_beta)) assert np.isclose(p.variance, np_var) - p = mle.estimate(df, "b", ["a"]) - np_beta, np_var = numpy_fit_mle_lg(df, "b", ["a"]) + p = mle.estimate(df, 
"B", ["A"]) + np_beta, np_var = numpy_fit_mle_lg(df, "B", ["A"]) assert np.all(np.isclose(p.beta, np_beta)) assert np.isclose(p.variance, np_var) - p = mle.estimate(df, "c", ["a", "b"]) - np_beta, np_var = numpy_fit_mle_lg(df, "c", ["a", "b"]) + p = mle.estimate(df, "C", ["A", "B"]) + np_beta, np_var = numpy_fit_mle_lg(df, "C", ["A", "B"]) assert np.all(np.isclose(p.beta, np_beta)) assert np.isclose(p.variance, np_var) - p = mle.estimate(df, "d", ["a", "b", "c"]) - np_beta, np_var = numpy_fit_mle_lg(df, "d", ["a", "b", "c"]) + p = mle.estimate(df, "D", ["A", "B", "C"]) + np_beta, np_var = numpy_fit_mle_lg(df, "D", ["A", "B", "C"]) assert np.all(np.isclose(p.beta, np_beta)) - assert np.isclose(p.variance, np_var) \ No newline at end of file + assert np.isclose(p.variance, np_var) diff --git a/tests/learning/scores/bic_test.py b/tests/learning/scores/bic_test.py index 77bd4060..49f9d689 100644 --- a/tests/learning/scores/bic_test.py +++ b/tests/learning/scores/bic_test.py @@ -1,11 +1,10 @@ import numpy as np -from scipy.stats import norm import pybnesian as pbn -import util_test +from helpers.data import DATA_SIZE, generate_normal_data +from scipy.stats import norm -SIZE = 10000 +df = generate_normal_data(DATA_SIZE) -df = util_test.generate_normal_data(SIZE) def numpy_local_score(data, variable, evidence): if isinstance(variable, str): @@ -20,66 +19,105 @@ def numpy_local_score(data, variable, evidence): N = variable_data.shape[0] d = evidence_data.shape[1] linregress_data = np.column_stack((np.ones(N), evidence_data.to_numpy())) - (beta, res, _, _) = np.linalg.lstsq(linregress_data, variable_data.to_numpy(), rcond=None) + (beta, res, _, _) = np.linalg.lstsq( + linregress_data, variable_data.to_numpy(), rcond=None + ) var = res / (N - d - 1) - means = beta[0] + np.sum(beta[1:]*evidence_data, axis=1) + means = beta[0] + np.sum(beta[1:] * evidence_data, axis=1) loglik = norm.logpdf(variable_data, means, np.sqrt(var)) return loglik.sum() - np.log(N) * 0.5 * (d + 2) + def test_bic_local_score(): - gbn = pbn.GaussianNetwork(['a', 'b', 'c', 'd'], [('a', 'b'), ('a', 'c'), ('a', 'd'), ('b', 'c'), ('b', 'd'), ('c', 'd')]) - + gbn = pbn.GaussianNetwork( + ["A", "B", "C", "D"], + [("A", "B"), ("A", "C"), ("A", "D"), ("B", "C"), ("B", "D"), ("C", "D")], + ) + bic = pbn.BIC(df) - - assert np.isclose(bic.local_score(gbn, 'a', []), numpy_local_score(df, 'a', [])) - assert np.isclose(bic.local_score(gbn, 'b', ['a']), numpy_local_score(df, 'b', ['a'])) - assert np.isclose(bic.local_score(gbn, 'c', ['a', 'b']), numpy_local_score(df, 'c', ['a', 'b'])) - assert np.isclose(bic.local_score(gbn, 'd', ['a', 'b', 'c']), numpy_local_score(df, 'd', ['a', 'b', 'c'])) - assert np.isclose(bic.local_score(gbn, 'd', ['a', 'b', 'c']), numpy_local_score(df, 'd', ['b', 'c', 'a'])) - - assert bic.local_score(gbn, 'a') == bic.local_score(gbn, 'a', gbn.parents('a')) - assert bic.local_score(gbn, 'b') == bic.local_score(gbn, 'b', gbn.parents('b')) - assert bic.local_score(gbn, 'c') == bic.local_score(gbn, 'c', gbn.parents('c')) - assert bic.local_score(gbn, 'd') == bic.local_score(gbn, 'd', gbn.parents('d')) + + assert np.isclose(bic.local_score(gbn, "A", []), numpy_local_score(df, "A", [])) + assert np.isclose( + bic.local_score(gbn, "B", ["A"]), numpy_local_score(df, "B", ["A"]) + ) + assert np.isclose( + bic.local_score(gbn, "C", ["A", "B"]), numpy_local_score(df, "C", ["A", "B"]) + ) + assert np.isclose( + bic.local_score(gbn, "D", ["A", "B", "C"]), + numpy_local_score(df, "D", ["A", "B", "C"]), + ) + assert np.isclose( + 
bic.local_score(gbn, "D", ["A", "B", "C"]), + numpy_local_score(df, "D", ["B", "C", "A"]), + ) + + assert bic.local_score(gbn, "A") == bic.local_score(gbn, "A", gbn.parents("A")) + assert bic.local_score(gbn, "B") == bic.local_score(gbn, "B", gbn.parents("B")) + assert bic.local_score(gbn, "C") == bic.local_score(gbn, "C", gbn.parents("C")) + assert bic.local_score(gbn, "D") == bic.local_score(gbn, "D", gbn.parents("D")) + def test_bic_local_score_null(): - gbn = pbn.GaussianNetwork(['a', 'b', 'c', 'd'], [('a', 'b'), ('a', 'c'), ('a', 'd'), ('b', 'c'), ('b', 'd'), ('c', 'd')]) + gbn = pbn.GaussianNetwork( + ["A", "B", "C", "D"], + [("A", "B"), ("A", "C"), ("A", "D"), ("B", "C"), ("B", "D"), ("C", "D")], + ) np.random.seed(0) - a_null = np.random.randint(0, SIZE, size=100) - b_null = np.random.randint(0, SIZE, size=100) - c_null = np.random.randint(0, SIZE, size=100) - d_null = np.random.randint(0, SIZE, size=100) + a_null = np.random.randint(0, DATA_SIZE, size=100) + b_null = np.random.randint(0, DATA_SIZE, size=100) + c_null = np.random.randint(0, DATA_SIZE, size=100) + d_null = np.random.randint(0, DATA_SIZE, size=100) df_null = df.copy() - df_null.loc[df_null.index[a_null], 'a'] = np.nan - df_null.loc[df_null.index[b_null], 'b'] = np.nan - df_null.loc[df_null.index[c_null], 'c'] = np.nan - df_null.loc[df_null.index[d_null], 'd'] = np.nan - + df_null.loc[df_null.index[a_null], "A"] = np.nan + df_null.loc[df_null.index[b_null], "B"] = np.nan + df_null.loc[df_null.index[c_null], "C"] = np.nan + df_null.loc[df_null.index[d_null], "D"] = np.nan + bic = pbn.BIC(df_null) - - assert np.isclose(bic.local_score(gbn, 'a', []), numpy_local_score(df_null, 'a', [])) - assert np.isclose(bic.local_score(gbn, 'b', ['a']), numpy_local_score(df_null, 'b', ['a'])) - assert np.isclose(bic.local_score(gbn, 'c', ['a', 'b']), numpy_local_score(df_null, 'c', ['a', 'b'])) - assert np.isclose(bic.local_score(gbn, 'd', ['a', 'b', 'c']), numpy_local_score(df_null, 'd', ['a', 'b', 'c'])) - assert np.isclose(bic.local_score(gbn, 'd', ['a', 'b', 'c']), numpy_local_score(df_null, 'd', ['b', 'c', 'a'])) - - assert bic.local_score(gbn, 'a') == bic.local_score(gbn, 'a', gbn.parents('a')) - assert bic.local_score(gbn, 'b') == bic.local_score(gbn, 'b', gbn.parents('b')) - assert bic.local_score(gbn, 'c') == bic.local_score(gbn, 'c', gbn.parents('c')) - assert bic.local_score(gbn, 'd') == bic.local_score(gbn, 'd', gbn.parents('d')) + + assert np.isclose( + bic.local_score(gbn, "A", []), numpy_local_score(df_null, "A", []) + ) + assert np.isclose( + bic.local_score(gbn, "B", ["A"]), numpy_local_score(df_null, "B", ["A"]) + ) + assert np.isclose( + bic.local_score(gbn, "C", ["A", "B"]), + numpy_local_score(df_null, "C", ["A", "B"]), + ) + assert np.isclose( + bic.local_score(gbn, "D", ["A", "B", "C"]), + numpy_local_score(df_null, "D", ["A", "B", "C"]), + ) + assert np.isclose( + bic.local_score(gbn, "D", ["A", "B", "C"]), + numpy_local_score(df_null, "D", ["B", "C", "A"]), + ) + + assert bic.local_score(gbn, "A") == bic.local_score(gbn, "A", gbn.parents("A")) + assert bic.local_score(gbn, "B") == bic.local_score(gbn, "B", gbn.parents("B")) + assert bic.local_score(gbn, "C") == bic.local_score(gbn, "C", gbn.parents("C")) + assert bic.local_score(gbn, "D") == bic.local_score(gbn, "D", gbn.parents("D")) + def test_bic_score(): - gbn = pbn.GaussianNetwork([('a', 'b'), ('a', 'c'), ('a', 'd'), ('b', 'c'), ('b', 'd'), ('c', 'd')]) - - bic = pbn.BIC(df) - - assert np.isclose(bic.score(gbn), (bic.local_score(gbn, 'a', []) + - 
bic.local_score(gbn, 'b', ['a']) + - bic.local_score(gbn, 'c', ['a', 'b']) + - bic.local_score(gbn, 'd', ['a', 'b', 'c']))) + gbn = pbn.GaussianNetwork( + [("A", "B"), ("A", "C"), ("A", "D"), ("B", "C"), ("B", "D"), ("C", "D")] + ) + bic = pbn.BIC(df) + assert np.isclose( + bic.score(gbn), + ( + bic.local_score(gbn, "A", []) + + bic.local_score(gbn, "B", ["A"]) + + bic.local_score(gbn, "C", ["A", "B"]) + + bic.local_score(gbn, "D", ["A", "B", "C"]) + ), + ) diff --git a/tests/learning/scores/cvlikelihood_test.py b/tests/learning/scores/cvlikelihood_test.py index 95c922c5..4164b836 100644 --- a/tests/learning/scores/cvlikelihood_test.py +++ b/tests/learning/scores/cvlikelihood_test.py @@ -1,96 +1,124 @@ -import pytest import numpy as np -from scipy.stats import norm, gaussian_kde +import pandas as pd import pybnesian as pbn -import util_test +import pytest +from helpers.data import generate_normal_data +from scipy.stats import gaussian_kde, norm SIZE = 1000 -df = util_test.generate_normal_data(SIZE) +df = generate_normal_data(SIZE) seed = 0 -def numpy_local_score(node_type, data, variable, evidence): + +def numpy_local_score( + node_type: pbn.FactorType, data: pd.DataFrame, variable: str, evidence: list[str] +): cv = pbn.CrossValidation(data, 10, seed) loglik = 0 for train_df, test_df in cv: - if isinstance(variable, str): - node_data = train_df.to_pandas().loc[:, [variable] + evidence].dropna() - variable_data = node_data.loc[:, variable] - evidence_data = node_data.loc[:, evidence] - test_node_data = test_df.to_pandas().loc[:, [variable] + evidence].dropna() - test_variable_data = test_node_data.loc[:, variable] - test_evidence_data = test_node_data.loc[:, evidence] - else: - node_data = train_df.to_pandas().iloc[:, [variable] + evidence].dropna() - variable_data = node_data.iloc[:, 0] - evidence_data = node_data.iloc[:, 1:] - test_node_data = test_df.to_pandas().iloc[:, [variable] + evidence].dropna() - test_variable_data = test_node_data.iloc[:, 0] - test_evidence_data = test_node_data.iloc[:, 1:] + node_data = train_df.to_pandas().loc[:, [variable] + evidence].dropna() + variable_data = node_data.loc[:, variable] + evidence_data = node_data.loc[:, evidence] + test_node_data = test_df.to_pandas().loc[:, [variable] + evidence].dropna() + test_variable_data = test_node_data.loc[:, variable] + test_evidence_data = test_node_data.loc[:, evidence] if node_type == pbn.LinearGaussianCPDType(): N = variable_data.shape[0] d = evidence_data.shape[1] linregress_data = np.column_stack((np.ones(N), evidence_data.to_numpy())) - (beta, res, _, _) = np.linalg.lstsq(linregress_data, variable_data.to_numpy(), rcond=None) + (beta, res, _, _) = np.linalg.lstsq( + linregress_data, variable_data.to_numpy(), rcond=None + ) var = res / (N - d - 1) - means = beta[0] + np.sum(beta[1:]*test_evidence_data, axis=1) + means = beta[0] + np.sum(beta[1:] * test_evidence_data, axis=1) loglik += norm.logpdf(test_variable_data, means, np.sqrt(var)).sum() + elif node_type == pbn.CKDEType(): - k_joint = gaussian_kde(node_data.to_numpy().T, - bw_method=lambda s : np.power(4 / (s.d + 2), 1 / (s.d + 4)) * s.scotts_factor()) + k_joint = gaussian_kde( + node_data.to_numpy().T, + bw_method=lambda s: np.power(4 / (s.d + 2), 1 / (s.d + 4)) + * s.scotts_factor(), + ) if evidence: - k_marg = gaussian_kde(evidence_data.to_numpy().T, bw_method=k_joint.covariance_factor()) - loglik += np.sum(k_joint.logpdf(test_node_data.to_numpy().T) - k_marg.logpdf(test_evidence_data.to_numpy().T)) + k_marg = gaussian_kde( + 
evidence_data.to_numpy().T, bw_method=k_joint.factor + ) + loglik += np.sum( + k_joint.logpdf(test_node_data.to_numpy().T) + - k_marg.logpdf(test_evidence_data.to_numpy().T) + ) else: loglik += np.sum(k_joint.logpdf(test_node_data.to_numpy().T)) return loglik + def test_cvl_create(): s = pbn.CVLikelihood(df) assert len(list(s.cv)) == 10 s = pbn.CVLikelihood(df, 5) assert len(list(s.cv)) == 5 - + s = pbn.CVLikelihood(df, 10, 0) assert len(list(s.cv)) == 10 s2 = pbn.CVLikelihood(df, 10, 0) assert len(list(s2.cv)) == 10 for (train_cv, test_cv), (train_cv2, test_cv2) in zip(s.cv, s2.cv): - assert train_cv.equals(train_cv2), "Train CV DataFrames with the same seed are not equal." - assert test_cv.equals(test_cv2), "Test CV DataFrames with the same seed are not equal." + assert train_cv.equals( + train_cv2 + ), "Train CV DataFrames with the same seed are not equal." + assert test_cv.equals( + test_cv2 + ), "Test CV DataFrames with the same seed are not equal." with pytest.raises(ValueError) as ex: - s = pbn.CVLikelihood(df, SIZE+1) + s = pbn.CVLikelihood(df, SIZE + 1) assert "Cannot split" in str(ex.value) + def test_cvl_local_score_gbn(): - gbn = pbn.GaussianNetwork([('a', 'b'), ('a', 'c'), ('a', 'd'), ('b', 'c'), ('b', 'd'), ('c', 'd')]) - + gbn = pbn.GaussianNetwork( + [("A", "B"), ("A", "C"), ("A", "D"), ("B", "C"), ("B", "D"), ("C", "D")] + ) + cvl = pbn.CVLikelihood(df, 10, seed) - - assert np.isclose(cvl.local_score(gbn, 'a', []), - numpy_local_score(pbn.LinearGaussianCPDType(), df, 'a', [])) - assert np.isclose(cvl.local_score(gbn, 'b', ['a']), - numpy_local_score(pbn.LinearGaussianCPDType(), df, 'b', ['a'])) - assert np.isclose(cvl.local_score(gbn, 'c', ['a', 'b']), - numpy_local_score(pbn.LinearGaussianCPDType(), df, 'c', ['a', 'b'])) - assert np.isclose(cvl.local_score(gbn, 'd', ['a', 'b', 'c']), - numpy_local_score(pbn.LinearGaussianCPDType(), df, 'd', ['a', 'b', 'c'])) - assert np.isclose(cvl.local_score(gbn, 'd', ['a', 'b', 'c']), - cvl.local_score(gbn, 'd', ['b', 'c', 'a'])) - - assert cvl.local_score(gbn, 'a') == cvl.local_score(gbn, 'a', gbn.parents('a')) - assert cvl.local_score(gbn, 'b') == cvl.local_score(gbn, 'b', gbn.parents('b')) - assert cvl.local_score(gbn, 'c') == cvl.local_score(gbn, 'c', gbn.parents('c')) - assert cvl.local_score(gbn, 'd') == cvl.local_score(gbn, 'd', gbn.parents('d')) + + assert np.isclose( + cvl.local_score(gbn, "A", []), + numpy_local_score(pbn.LinearGaussianCPDType(), df, "A", []), + ) + assert np.isclose( + cvl.local_score(gbn, "B", ["A"]), + numpy_local_score(pbn.LinearGaussianCPDType(), df, "B", ["A"]), + ) + assert np.isclose( + cvl.local_score(gbn, "C", ["A", "B"]), + numpy_local_score(pbn.LinearGaussianCPDType(), df, "C", ["A", "B"]), + ) + assert np.isclose( + cvl.local_score(gbn, "D", ["A", "B", "C"]), + numpy_local_score(pbn.LinearGaussianCPDType(), df, "D", ["A", "B", "C"]), + ) + assert np.isclose( + cvl.local_score(gbn, "D", ["A", "B", "C"]), + cvl.local_score(gbn, "D", ["B", "C", "A"]), + ) + + assert cvl.local_score(gbn, "A") == cvl.local_score(gbn, "A", gbn.parents("A")) + assert cvl.local_score(gbn, "B") == cvl.local_score(gbn, "B", gbn.parents("B")) + assert cvl.local_score(gbn, "C") == cvl.local_score(gbn, "C", gbn.parents("C")) + assert cvl.local_score(gbn, "D") == cvl.local_score(gbn, "D", gbn.parents("D")) + def test_cvl_local_score_gbn_null(): - gbn = pbn.GaussianNetwork([('a', 'b'), ('a', 'c'), ('a', 'd'), ('b', 'c'), ('b', 'd'), ('c', 'd')]) - + gbn = pbn.GaussianNetwork( + [("A", "B"), ("A", "C"), ("A", "D"), ("B", "C"), 
("B", "D"), ("C", "D")] + ) + np.random.seed(0) a_null = np.random.randint(0, SIZE, size=100) b_null = np.random.randint(0, SIZE, size=100) @@ -98,67 +126,101 @@ def test_cvl_local_score_gbn_null(): d_null = np.random.randint(0, SIZE, size=100) df_null = df.copy() - df_null.loc[df_null.index[a_null], 'a'] = np.nan - df_null.loc[df_null.index[b_null], 'b'] = np.nan - df_null.loc[df_null.index[c_null], 'c'] = np.nan - df_null.loc[df_null.index[d_null], 'd'] = np.nan + df_null.loc[df_null.index[a_null], "A"] = np.nan + df_null.loc[df_null.index[b_null], "B"] = np.nan + df_null.loc[df_null.index[c_null], "C"] = np.nan + df_null.loc[df_null.index[d_null], "D"] = np.nan cvl = pbn.CVLikelihood(df_null, 10, seed) - assert np.isclose(cvl.local_score(gbn, 'a', []), - numpy_local_score(pbn.LinearGaussianCPDType(), df_null, 'a', [])) - assert np.isclose(cvl.local_score(gbn, 'b', ['a']), - numpy_local_score(pbn.LinearGaussianCPDType(), df_null, 'b', ['a'])) - assert np.isclose(cvl.local_score(gbn, 'c', ['a', 'b']), - numpy_local_score(pbn.LinearGaussianCPDType(), df_null, 'c', ['a', 'b'])) - assert np.isclose(cvl.local_score(gbn, 'd', ['a', 'b', 'c']), - numpy_local_score(pbn.LinearGaussianCPDType(), df_null, 'd', ['a', 'b', 'c'])) - assert np.isclose(cvl.local_score(gbn, 'd', ['a', 'b', 'c']), - cvl.local_score(gbn, 'd', ['b', 'c', 'a'])) - - assert cvl.local_score(gbn, 'a') == cvl.local_score(gbn, 'a', gbn.parents('a')) - assert cvl.local_score(gbn, 'b') == cvl.local_score(gbn, 'b', gbn.parents('b')) - assert cvl.local_score(gbn, 'c') == cvl.local_score(gbn, 'c', gbn.parents('c')) - assert cvl.local_score(gbn, 'd') == cvl.local_score(gbn, 'd', gbn.parents('d')) + assert np.isclose( + cvl.local_score(gbn, "A", []), + numpy_local_score(pbn.LinearGaussianCPDType(), df_null, "A", []), + ) + assert np.isclose( + cvl.local_score(gbn, "B", ["A"]), + numpy_local_score(pbn.LinearGaussianCPDType(), df_null, "B", ["A"]), + ) + assert np.isclose( + cvl.local_score(gbn, "C", ["A", "B"]), + numpy_local_score(pbn.LinearGaussianCPDType(), df_null, "C", ["A", "B"]), + ) + assert np.isclose( + cvl.local_score(gbn, "D", ["A", "B", "C"]), + numpy_local_score(pbn.LinearGaussianCPDType(), df_null, "D", ["A", "B", "C"]), + ) + assert np.isclose( + cvl.local_score(gbn, "D", ["A", "B", "C"]), + cvl.local_score(gbn, "D", ["B", "C", "A"]), + ) + + assert cvl.local_score(gbn, "A") == cvl.local_score(gbn, "A", gbn.parents("A")) + assert cvl.local_score(gbn, "B") == cvl.local_score(gbn, "B", gbn.parents("B")) + assert cvl.local_score(gbn, "C") == cvl.local_score(gbn, "C", gbn.parents("C")) + assert cvl.local_score(gbn, "D") == cvl.local_score(gbn, "D", gbn.parents("D")) + def test_cvl_local_score_spbn(): - spbn = pbn.SemiparametricBN([('a', 'b'), ('a', 'c'), ('a', 'd'), ('b', 'c'), ('b', 'd'), ('c', 'd')], - [('a', pbn.CKDEType()), ('c', pbn.CKDEType())]) - + spbn = pbn.SemiparametricBN( + [("A", "B"), ("A", "C"), ("A", "D"), ("B", "C"), ("B", "D"), ("C", "D")], + [("A", pbn.CKDEType()), ("C", pbn.CKDEType())], + ) + cvl = pbn.CVLikelihood(df, 10, seed) - assert np.isclose(cvl.local_score(spbn, 'a', []), - numpy_local_score(pbn.CKDEType(), df, 'a', [])) - assert np.isclose(cvl.local_score(spbn, 'b', ['a']), - numpy_local_score(pbn.LinearGaussianCPDType(), df, 'b', ['a'])) - assert np.isclose(cvl.local_score(spbn, 'c', ['a', 'b']), - numpy_local_score(pbn.CKDEType(), df, 'c', ['a', 'b'])) - assert np.isclose(cvl.local_score(spbn, 'd', ['a', 'b', 'c']), - numpy_local_score(pbn.LinearGaussianCPDType(), df, 'd', ['a', 'b', 'c'])) - 
assert np.isclose(cvl.local_score(spbn, 'd', ['a', 'b', 'c']), - numpy_local_score(pbn.LinearGaussianCPDType(), df, 'd', ['b', 'c', 'a'])) - - assert cvl.local_score(spbn, 'a') == cvl.local_score(spbn, 'a', spbn.parents('a')) - assert cvl.local_score(spbn, 'b') == cvl.local_score(spbn, 'b', spbn.parents('b')) - assert cvl.local_score(spbn, 'c') == cvl.local_score(spbn, 'c', spbn.parents('c')) - assert cvl.local_score(spbn, 'd') == cvl.local_score(spbn, 'd', spbn.parents('d')) - - assert np.isclose(cvl.local_score_node_type(spbn, pbn.LinearGaussianCPDType(), 'a', []), - numpy_local_score(pbn.LinearGaussianCPDType(), df, 'a', [])) - assert np.isclose(cvl.local_score_node_type(spbn, pbn.CKDEType(), 'b', ['a']), - numpy_local_score(pbn.CKDEType(), df, 'b', ['a'])) - assert np.isclose(cvl.local_score_node_type(spbn, pbn.LinearGaussianCPDType(), 'c', ['a', 'b']), - numpy_local_score(pbn.LinearGaussianCPDType(), df, 'c', ['a', 'b'])) - assert np.isclose(cvl.local_score_node_type(spbn, pbn.CKDEType(), 'd', ['a', 'b', 'c']), - numpy_local_score(pbn.CKDEType(), df, 'd', ['a', 'b', 'c'])) - assert np.isclose(cvl.local_score_node_type(spbn, pbn.CKDEType(), 'd', ['a', 'b', 'c']), - numpy_local_score(pbn.CKDEType(), df, 'd', ['b', 'c', 'a'])) + assert np.isclose( + cvl.local_score(spbn, "A", []), numpy_local_score(pbn.CKDEType(), df, "A", []) + ) + assert np.isclose( + cvl.local_score(spbn, "B", ["A"]), + numpy_local_score(pbn.LinearGaussianCPDType(), df, "B", ["A"]), + ) + assert np.isclose( + cvl.local_score(spbn, "C", ["A", "B"]), + numpy_local_score(pbn.CKDEType(), df, "C", ["A", "B"]), + ) + assert np.isclose( + cvl.local_score(spbn, "D", ["A", "B", "C"]), + numpy_local_score(pbn.LinearGaussianCPDType(), df, "D", ["A", "B", "C"]), + ) + assert np.isclose( + cvl.local_score(spbn, "D", ["A", "B", "C"]), + numpy_local_score(pbn.LinearGaussianCPDType(), df, "D", ["B", "C", "A"]), + ) + + assert cvl.local_score(spbn, "A") == cvl.local_score(spbn, "A", spbn.parents("A")) + assert cvl.local_score(spbn, "B") == cvl.local_score(spbn, "B", spbn.parents("B")) + assert cvl.local_score(spbn, "C") == cvl.local_score(spbn, "C", spbn.parents("C")) + assert cvl.local_score(spbn, "D") == cvl.local_score(spbn, "D", spbn.parents("D")) + + assert np.isclose( + cvl.local_score_node_type(spbn, pbn.LinearGaussianCPDType(), "A", []), + numpy_local_score(pbn.LinearGaussianCPDType(), df, "A", []), + ) + assert np.isclose( + cvl.local_score_node_type(spbn, pbn.CKDEType(), "B", ["A"]), + numpy_local_score(pbn.CKDEType(), df, "B", ["A"]), + ) + assert np.isclose( + cvl.local_score_node_type(spbn, pbn.LinearGaussianCPDType(), "C", ["A", "B"]), + numpy_local_score(pbn.LinearGaussianCPDType(), df, "C", ["A", "B"]), + ) + assert np.isclose( + cvl.local_score_node_type(spbn, pbn.CKDEType(), "D", ["A", "B", "C"]), + numpy_local_score(pbn.CKDEType(), df, "D", ["A", "B", "C"]), + ) + assert np.isclose( + cvl.local_score_node_type(spbn, pbn.CKDEType(), "D", ["A", "B", "C"]), + numpy_local_score(pbn.CKDEType(), df, "D", ["B", "C", "A"]), + ) def test_cvl_local_score_null_spbn(): - spbn = pbn.SemiparametricBN([('a', 'b'), ('a', 'c'), ('a', 'd'), ('b', 'c'), ('b', 'd'), ('c', 'd')], - [('a', pbn.CKDEType()), ('c', pbn.CKDEType())]) - + spbn = pbn.SemiparametricBN( + [("A", "B"), ("A", "C"), ("A", "D"), ("B", "C"), ("B", "D"), ("C", "D")], + [("A", pbn.CKDEType()), ("C", pbn.CKDEType())], + ) + np.random.seed(0) a_null = np.random.randint(0, SIZE, size=100) b_null = np.random.randint(0, SIZE, size=100) @@ -166,56 +228,89 @@ def 
test_cvl_local_score_null_spbn(): d_null = np.random.randint(0, SIZE, size=100) df_null = df.copy() - df_null.loc[df_null.index[a_null], 'a'] = np.nan - df_null.loc[df_null.index[b_null], 'b'] = np.nan - df_null.loc[df_null.index[c_null], 'c'] = np.nan - df_null.loc[df_null.index[d_null], 'd'] = np.nan + df_null.loc[df_null.index[a_null], "A"] = np.nan + df_null.loc[df_null.index[b_null], "B"] = np.nan + df_null.loc[df_null.index[c_null], "C"] = np.nan + df_null.loc[df_null.index[d_null], "D"] = np.nan cvl = pbn.CVLikelihood(df_null, 10, seed) - assert np.isclose(cvl.local_score(spbn, 'a', []), - numpy_local_score(pbn.CKDEType(), df_null, 'a', [])) - assert np.isclose(cvl.local_score(spbn, 'b', ['a']), - numpy_local_score(pbn.LinearGaussianCPDType(), df_null, 'b', ['a'])) - assert np.isclose(cvl.local_score(spbn, 'c', ['a', 'b']), - numpy_local_score(pbn.CKDEType(), df_null, 'c', ['a', 'b'])) - assert np.isclose(cvl.local_score(spbn, 'd', ['a', 'b', 'c']), - numpy_local_score(pbn.LinearGaussianCPDType(), df_null, 'd', ['a', 'b', 'c'])) - assert np.isclose(cvl.local_score(spbn, 'd', ['a', 'b', 'c']), - numpy_local_score(pbn.LinearGaussianCPDType(), df_null, 'd', ['b', 'c', 'a'])) - - assert cvl.local_score(spbn, 'a') == cvl.local_score(spbn, 'a', spbn.parents('a')) - assert cvl.local_score(spbn, 'b') == cvl.local_score(spbn, 'b', spbn.parents('b')) - assert cvl.local_score(spbn, 'c') == cvl.local_score(spbn, 'c', spbn.parents('c')) - assert cvl.local_score(spbn, 'd') == cvl.local_score(spbn, 'd', spbn.parents('d')) - - assert np.isclose(cvl.local_score_node_type(spbn, pbn.LinearGaussianCPDType(), 'a', []), - numpy_local_score(pbn.LinearGaussianCPDType(), df_null, 'a', [])) - assert np.isclose(cvl.local_score_node_type(spbn, pbn.CKDEType(), 'b', ['a']), - numpy_local_score(pbn.CKDEType(), df_null, 'b', ['a'])) - assert np.isclose(cvl.local_score_node_type(spbn, pbn.LinearGaussianCPDType(), 'c', ['a', 'b']), - numpy_local_score(pbn.LinearGaussianCPDType(), df_null, 'c', ['a', 'b'])) - assert np.isclose(cvl.local_score_node_type(spbn, pbn.CKDEType(), 'd', ['a', 'b', 'c']), - numpy_local_score(pbn.CKDEType(), df_null, 'd', ['a', 'b', 'c'])) - assert np.isclose(cvl.local_score_node_type(spbn, pbn.CKDEType(), 'd', ['a', 'b', 'c']), - numpy_local_score(pbn.CKDEType(), df_null, 'd', ['b', 'c', 'a'])) + assert np.isclose( + cvl.local_score(spbn, "A", []), + numpy_local_score(pbn.CKDEType(), df_null, "A", []), + ) + assert np.isclose( + cvl.local_score(spbn, "B", ["A"]), + numpy_local_score(pbn.LinearGaussianCPDType(), df_null, "B", ["A"]), + ) + assert np.isclose( + cvl.local_score(spbn, "C", ["A", "B"]), + numpy_local_score(pbn.CKDEType(), df_null, "C", ["A", "B"]), + ) + assert np.isclose( + cvl.local_score(spbn, "D", ["A", "B", "C"]), + numpy_local_score(pbn.LinearGaussianCPDType(), df_null, "D", ["A", "B", "C"]), + ) + assert np.isclose( + cvl.local_score(spbn, "D", ["A", "B", "C"]), + numpy_local_score(pbn.LinearGaussianCPDType(), df_null, "D", ["B", "C", "A"]), + ) + + assert cvl.local_score(spbn, "A") == cvl.local_score(spbn, "A", spbn.parents("A")) + assert cvl.local_score(spbn, "B") == cvl.local_score(spbn, "B", spbn.parents("B")) + assert cvl.local_score(spbn, "C") == cvl.local_score(spbn, "C", spbn.parents("C")) + assert cvl.local_score(spbn, "D") == cvl.local_score(spbn, "D", spbn.parents("D")) + + assert np.isclose( + cvl.local_score_node_type(spbn, pbn.LinearGaussianCPDType(), "A", []), + numpy_local_score(pbn.LinearGaussianCPDType(), df_null, "A", []), + ) + assert np.isclose( + 
cvl.local_score_node_type(spbn, pbn.CKDEType(), "B", ["A"]), + numpy_local_score(pbn.CKDEType(), df_null, "B", ["A"]), + ) + assert np.isclose( + cvl.local_score_node_type(spbn, pbn.LinearGaussianCPDType(), "C", ["A", "B"]), + numpy_local_score(pbn.LinearGaussianCPDType(), df_null, "C", ["A", "B"]), + ) + assert np.isclose( + cvl.local_score_node_type(spbn, pbn.CKDEType(), "D", ["A", "B", "C"]), + numpy_local_score(pbn.CKDEType(), df_null, "D", ["A", "B", "C"]), + ) + assert np.isclose( + cvl.local_score_node_type(spbn, pbn.CKDEType(), "D", ["A", "B", "C"]), + numpy_local_score(pbn.CKDEType(), df_null, "D", ["B", "C", "A"]), + ) + def test_cvl_score(): - gbn = pbn.GaussianNetwork([('a', 'b'), ('a', 'c'), ('a', 'd'), ('b', 'c'), ('b', 'd'), ('c', 'd')]) + gbn = pbn.GaussianNetwork( + [("A", "B"), ("A", "C"), ("A", "D"), ("B", "C"), ("B", "D"), ("C", "D")] + ) cv = pbn.CVLikelihood(df, 10, 0) - assert np.isclose(cv.score(gbn), ( - cv.local_score(gbn, 'a', []) + - cv.local_score(gbn, 'b', ['a']) + - cv.local_score(gbn, 'c', ['a', 'b']) + - cv.local_score(gbn, 'd', ['a', 'b', 'c']))) - - spbn = pbn.SemiparametricBN([('a', 'b'), ('a', 'c'), ('a', 'd'), ('b', 'c'), ('b', 'd'), ('c', 'd')], - [('a', pbn.CKDEType()), ('c', pbn.CKDEType())]) - - assert np.isclose(cv.score(spbn), ( - cv.local_score(spbn, 'a') + - cv.local_score(spbn, 'b') + - cv.local_score(spbn, 'c') + - cv.local_score(spbn, 'd'))) \ No newline at end of file + assert np.isclose( + cv.score(gbn), + ( + cv.local_score(gbn, "A", []) + + cv.local_score(gbn, "B", ["A"]) + + cv.local_score(gbn, "C", ["A", "B"]) + + cv.local_score(gbn, "D", ["A", "B", "C"]) + ), + ) + + spbn = pbn.SemiparametricBN( + [("A", "B"), ("A", "C"), ("A", "D"), ("B", "C"), ("B", "D"), ("C", "D")], + [("A", pbn.CKDEType()), ("C", pbn.CKDEType())], + ) + + assert np.isclose( + cv.score(spbn), + ( + cv.local_score(spbn, "A") + + cv.local_score(spbn, "B") + + cv.local_score(spbn, "C") + + cv.local_score(spbn, "D") + ), + ) diff --git a/tests/learning/scores/holdoutlikelihood_test.py b/tests/learning/scores/holdoutlikelihood_test.py index 199889a3..b795898f 100644 --- a/tests/learning/scores/holdoutlikelihood_test.py +++ b/tests/learning/scores/holdoutlikelihood_test.py @@ -1,95 +1,149 @@ -import pytest import numpy as np -from scipy.stats import gaussian_kde, norm +import pandas as pd import pybnesian as pbn -import util_test +import pytest +from helpers.data import generate_normal_data +from scipy.stats import gaussian_kde, norm SIZE = 1000 -df = util_test.generate_normal_data(SIZE) +df = generate_normal_data(SIZE) seed = 0 -def numpy_local_score(node_type, training_data, test_data, variable, evidence): - if isinstance(variable, str): - node_data = training_data.loc[:, [variable] + evidence].dropna() - variable_data = node_data.loc[:, variable] - evidence_data = node_data.loc[:, evidence] - test_node_data = test_data.loc[:, [variable] + evidence].dropna() - test_variable_data = test_node_data.loc[:, variable] - test_evidence_data = test_node_data.loc[:, evidence] - else: - node_data = training_data.iloc[:, [variable] + evidence].dropna() - variable_data = node_data.iloc[:, 0] - evidence_data = node_data.iloc[:, 1:] - test_node_data = test_data.iloc[:, [variable] + evidence].dropna() - test_variable_data = test_node_data.iloc[:, 0] - test_evidence_data = test_node_data.iloc[:, 1:] +def numpy_local_score( + node_type: pbn.FactorType, + training_data: pd.DataFrame, + test_data: pd.DataFrame, + variable: str, + evidence: list[str], +): + + node_data = 
training_data.loc[:, [variable] + evidence].dropna() + variable_data = node_data.loc[:, variable] + evidence_data = node_data.loc[:, evidence] + test_node_data = test_data.loc[:, [variable] + evidence].dropna() + test_variable_data = test_node_data.loc[:, variable] + test_evidence_data = test_node_data.loc[:, evidence] + + loglik = 0 if node_type == pbn.LinearGaussianCPDType(): N = variable_data.shape[0] d = evidence_data.shape[1] linregress_data = np.column_stack((np.ones(N), evidence_data.to_numpy())) - (beta, res, _, _) = np.linalg.lstsq(linregress_data, variable_data.to_numpy(), rcond=None) + (beta, res, _, _) = np.linalg.lstsq( + linregress_data, variable_data.to_numpy(), rcond=None + ) var = res / (N - d - 1) - means = beta[0] + np.sum(beta[1:]*test_evidence_data, axis=1) - return norm.logpdf(test_variable_data, means, np.sqrt(var)).sum() + means = beta[0] + np.sum(beta[1:] * test_evidence_data, axis=1) + loglik = norm.logpdf(test_variable_data, means, np.sqrt(var)).sum() + elif node_type == pbn.CKDEType(): - k_joint = gaussian_kde(node_data.to_numpy().T, - bw_method=lambda s : np.power(4 / (s.d + 2), 1 / (s.d + 4)) * s.scotts_factor()) + k_joint = gaussian_kde( + node_data.to_numpy().T, + bw_method=lambda s: np.power(4 / (s.d + 2), 1 / (s.d + 4)) + * s.scotts_factor(), + ) if evidence: - k_marg = gaussian_kde(evidence_data.to_numpy().T, bw_method=k_joint.covariance_factor()) - return np.sum(k_joint.logpdf(test_node_data.to_numpy().T) - k_marg.logpdf(test_evidence_data.to_numpy().T)) + k_marg = gaussian_kde(evidence_data.to_numpy().T, bw_method=k_joint.factor) + loglik = np.sum( + k_joint.logpdf(test_node_data.to_numpy().T) + - k_marg.logpdf(test_evidence_data.to_numpy().T) + ) else: - return np.sum(k_joint.logpdf(test_node_data.to_numpy().T)) + loglik = np.sum(k_joint.logpdf(test_node_data.to_numpy().T)) + + return loglik + def test_holdout_create(): + """Test HoldoutLikelihood creation with different parameters""" s = pbn.HoldoutLikelihood(df) assert s.training_data().num_rows == 0.8 * SIZE assert s.test_data().num_rows == 0.2 * SIZE - s = pbn.HoldoutLikelihood(df, 0.5) + s = pbn.HoldoutLikelihood(df, test_ratio=0.5) assert s.training_data().num_rows == 0.5 * SIZE assert s.test_data().num_rows == 0.5 * SIZE - - s = pbn.HoldoutLikelihood(df, 0.2, 0) - s2 = pbn.HoldoutLikelihood(df, 0.2, 0) + + s = pbn.HoldoutLikelihood(df, test_ratio=0.2, seed=0) + s2 = pbn.HoldoutLikelihood(df, test_ratio=0.2, seed=0) assert s.training_data().equals(s2.training_data()) assert s.test_data().equals(s2.test_data()) with pytest.raises(ValueError) as ex: - s = pbn.HoldoutLikelihood(df, 10, 0) - assert "test_ratio must be a number" in str(ex.value) + s = pbn.HoldoutLikelihood(df, test_ratio=10, seed=0) + assert "test_ratio must be a number" in str(ex.value) with pytest.raises(ValueError) as ex: - s = pbn.HoldoutLikelihood(df, 0, 0) - assert "test_ratio must be a number" in str(ex.value) + s = pbn.HoldoutLikelihood(df, test_ratio=0, seed=0) + assert "test_ratio must be a number" in str(ex.value) def test_holdout_local_score_gbn(): - gbn = pbn.GaussianNetwork([('a', 'b'), ('a', 'c'), ('a', 'd'), ('b', 'c'), ('b', 'd'), ('c', 'd')]) - + gbn = pbn.GaussianNetwork( + [("A", "B"), ("A", "C"), ("A", "D"), ("B", "C"), ("B", "D"), ("C", "D")] + ) + hl = pbn.HoldoutLikelihood(df, 0.2, seed) - assert np.isclose(hl.local_score(gbn, 'a', []), - numpy_local_score(pbn.LinearGaussianCPDType(), hl.training_data().to_pandas(), hl.test_data().to_pandas(), 'a', [])) - assert np.isclose(hl.local_score(gbn, 'b', ['a']), - 
numpy_local_score(pbn.LinearGaussianCPDType(), hl.training_data().to_pandas(), hl.test_data().to_pandas(), 'b', ['a'])) - assert np.isclose(hl.local_score(gbn, 'c', ['a', 'b']), - numpy_local_score(pbn.LinearGaussianCPDType(), hl.training_data().to_pandas(), hl.test_data().to_pandas(), 'c', ['a', 'b'])) - assert np.isclose(hl.local_score(gbn, 'd', ['a', 'b', 'c']), - numpy_local_score(pbn.LinearGaussianCPDType(), hl.training_data().to_pandas(), hl.test_data().to_pandas(), 'd', ['a', 'b', 'c'])) - assert np.isclose(hl.local_score(gbn, 'd', ['a', 'b', 'c']), - hl.local_score(gbn, 'd', ['b', 'c', 'a'])) - - assert hl.local_score(gbn, 'a') == hl.local_score(gbn, 'a', gbn.parents('a')) - assert hl.local_score(gbn, 'b') == hl.local_score(gbn, 'b', gbn.parents('b')) - assert hl.local_score(gbn, 'c') == hl.local_score(gbn, 'c', gbn.parents('c')) - assert hl.local_score(gbn, 'd') == hl.local_score(gbn, 'd', gbn.parents('d')) + assert np.isclose( + hl.local_score(gbn, "A", []), + numpy_local_score( + pbn.LinearGaussianCPDType(), + hl.training_data().to_pandas(), + hl.test_data().to_pandas(), + "A", + [], + ), + ) + assert np.isclose( + hl.local_score(gbn, "B", ["A"]), + numpy_local_score( + pbn.LinearGaussianCPDType(), + hl.training_data().to_pandas(), + hl.test_data().to_pandas(), + "B", + ["A"], + ), + ) + assert np.isclose( + hl.local_score(gbn, "C", ["A", "B"]), + numpy_local_score( + pbn.LinearGaussianCPDType(), + hl.training_data().to_pandas(), + hl.test_data().to_pandas(), + "C", + ["A", "B"], + ), + ) + assert np.isclose( + hl.local_score(gbn, "D", ["A", "B", "C"]), + numpy_local_score( + pbn.LinearGaussianCPDType(), + hl.training_data().to_pandas(), + hl.test_data().to_pandas(), + "D", + ["A", "B", "C"], + ), + ) + assert np.isclose( + hl.local_score(gbn, "D", ["A", "B", "C"]), + hl.local_score(gbn, "D", ["B", "C", "A"]), + ) + + assert hl.local_score(gbn, "A") == hl.local_score(gbn, "A", gbn.parents("A")) + assert hl.local_score(gbn, "B") == hl.local_score(gbn, "B", gbn.parents("B")) + assert hl.local_score(gbn, "C") == hl.local_score(gbn, "C", gbn.parents("C")) + assert hl.local_score(gbn, "D") == hl.local_score(gbn, "D", gbn.parents("D")) + def test_holdout_local_score_gbn_null(): - gbn = pbn.GaussianNetwork([('a', 'b'), ('a', 'c'), ('a', 'd'), ('b', 'c'), ('b', 'd'), ('c', 'd')]) - + gbn = pbn.GaussianNetwork( + [("A", "B"), ("A", "C"), ("A", "D"), ("B", "C"), ("B", "D"), ("C", "D")] + ) + np.random.seed(0) a_null = np.random.randint(0, SIZE, size=100) b_null = np.random.randint(0, SIZE, size=100) @@ -97,55 +151,135 @@ def test_holdout_local_score_gbn_null(): d_null = np.random.randint(0, SIZE, size=100) df_null = df.copy() - df_null.loc[df_null.index[a_null], 'a'] = np.nan - df_null.loc[df_null.index[b_null], 'b'] = np.nan - df_null.loc[df_null.index[c_null], 'c'] = np.nan - df_null.loc[df_null.index[d_null], 'd'] = np.nan + df_null.loc[df_null.index[a_null], "A"] = np.nan + df_null.loc[df_null.index[b_null], "B"] = np.nan + df_null.loc[df_null.index[c_null], "C"] = np.nan + df_null.loc[df_null.index[d_null], "D"] = np.nan hl = pbn.HoldoutLikelihood(df_null, 0.2, seed) - assert np.isclose(hl.local_score(gbn, 'a', []), - numpy_local_score(pbn.LinearGaussianCPDType(), hl.training_data().to_pandas(), hl.test_data().to_pandas(), 'a', [])) - assert np.isclose(hl.local_score(gbn, 'b', ['a']), - numpy_local_score(pbn.LinearGaussianCPDType(), hl.training_data().to_pandas(), hl.test_data().to_pandas(), 'b', ['a'])) - assert np.isclose(hl.local_score(gbn, 'c', ['a', 'b']), - 
numpy_local_score(pbn.LinearGaussianCPDType(), hl.training_data().to_pandas(), hl.test_data().to_pandas(), 'c', ['a', 'b'])) - assert np.isclose(hl.local_score(gbn, 'd', ['a', 'b', 'c']), - numpy_local_score(pbn.LinearGaussianCPDType(), hl.training_data().to_pandas(), hl.test_data().to_pandas(), 'd', ['a', 'b', 'c'])) - assert np.isclose(hl.local_score(gbn, 'd', ['a', 'b', 'c']), - hl.local_score(gbn, 'd', ['b', 'c', 'a'])) - - assert hl.local_score(gbn, 'a') == hl.local_score(gbn, 'a', gbn.parents('a')) - assert hl.local_score(gbn, 'b') == hl.local_score(gbn, 'b', gbn.parents('b')) - assert hl.local_score(gbn, 'c') == hl.local_score(gbn, 'c', gbn.parents('c')) - assert hl.local_score(gbn, 'd') == hl.local_score(gbn, 'd', gbn.parents('d')) + assert np.isclose( + hl.local_score(gbn, "A", []), + numpy_local_score( + pbn.LinearGaussianCPDType(), + hl.training_data().to_pandas(), + hl.test_data().to_pandas(), + "A", + [], + ), + ) + assert np.isclose( + hl.local_score(gbn, "B", ["A"]), + numpy_local_score( + pbn.LinearGaussianCPDType(), + hl.training_data().to_pandas(), + hl.test_data().to_pandas(), + "B", + ["A"], + ), + ) + assert np.isclose( + hl.local_score(gbn, "C", ["A", "B"]), + numpy_local_score( + pbn.LinearGaussianCPDType(), + hl.training_data().to_pandas(), + hl.test_data().to_pandas(), + "C", + ["A", "B"], + ), + ) + assert np.isclose( + hl.local_score(gbn, "D", ["A", "B", "C"]), + numpy_local_score( + pbn.LinearGaussianCPDType(), + hl.training_data().to_pandas(), + hl.test_data().to_pandas(), + "D", + ["A", "B", "C"], + ), + ) + assert np.isclose( + hl.local_score(gbn, "D", ["A", "B", "C"]), + hl.local_score(gbn, "D", ["B", "C", "A"]), + ) + + assert hl.local_score(gbn, "A") == hl.local_score(gbn, "A", gbn.parents("A")) + assert hl.local_score(gbn, "B") == hl.local_score(gbn, "B", gbn.parents("B")) + assert hl.local_score(gbn, "C") == hl.local_score(gbn, "C", gbn.parents("C")) + assert hl.local_score(gbn, "D") == hl.local_score(gbn, "D", gbn.parents("D")) + def test_holdout_local_score_spbn(): - spbn = pbn.SemiparametricBN([('a', 'b'), ('a', 'c'), ('a', 'd'), ('b', 'c'), ('b', 'd'), ('c', 'd')], - [('a', pbn.CKDEType()), ('c', pbn.CKDEType())]) - + spbn = pbn.SemiparametricBN( + [("A", "B"), ("A", "C"), ("A", "D"), ("B", "C"), ("B", "D"), ("C", "D")], + [("A", pbn.CKDEType()), ("C", pbn.CKDEType())], + ) + hl = pbn.HoldoutLikelihood(df, 0.2, seed) - assert np.isclose(hl.local_score(spbn, 'a', []), - numpy_local_score(pbn.CKDEType(), hl.training_data().to_pandas(), hl.test_data().to_pandas(), 'a', [])) - assert np.isclose(hl.local_score(spbn, 'b', ['a']), - numpy_local_score(pbn.LinearGaussianCPDType(), hl.training_data().to_pandas(), hl.test_data().to_pandas(), 'b', ['a'])) - assert np.isclose(hl.local_score(spbn, 'c', ['a', 'b']), - numpy_local_score(pbn.CKDEType(), hl.training_data().to_pandas(), hl.test_data().to_pandas(), 'c', ['a', 'b'])) - assert np.isclose(hl.local_score(spbn, 'd', ['a', 'b', 'c']), - numpy_local_score(pbn.LinearGaussianCPDType(), hl.training_data().to_pandas(), hl.test_data().to_pandas(), 'd', ['a', 'b', 'c'])) - assert np.isclose(hl.local_score(spbn, 'd', ['a', 'b', 'c']), - numpy_local_score(pbn.LinearGaussianCPDType(), hl.training_data().to_pandas(), hl.test_data().to_pandas(), 'd', ['b', 'c', 'a'])) - - assert hl.local_score(spbn, 'a') == hl.local_score(spbn, 'a', spbn.parents('a')) - assert hl.local_score(spbn, 'b') == hl.local_score(spbn, 'b', spbn.parents('b')) - assert hl.local_score(spbn, 'c') == hl.local_score(spbn, 'c', spbn.parents('c')) - 
assert hl.local_score(spbn, 'd') == hl.local_score(spbn, 'd', spbn.parents('d')) + assert np.isclose( + hl.local_score(spbn, "A", []), + numpy_local_score( + pbn.CKDEType(), + hl.training_data().to_pandas(), + hl.test_data().to_pandas(), + "A", + [], + ), + ) + assert np.isclose( + hl.local_score(spbn, "B", ["A"]), + numpy_local_score( + pbn.LinearGaussianCPDType(), + hl.training_data().to_pandas(), + hl.test_data().to_pandas(), + "B", + ["A"], + ), + ) + assert np.isclose( + hl.local_score(spbn, "C", ["A", "B"]), + numpy_local_score( + pbn.CKDEType(), + hl.training_data().to_pandas(), + hl.test_data().to_pandas(), + "C", + ["A", "B"], + ), + ) + assert np.isclose( + hl.local_score(spbn, "D", ["A", "B", "C"]), + numpy_local_score( + pbn.LinearGaussianCPDType(), + hl.training_data().to_pandas(), + hl.test_data().to_pandas(), + "D", + ["A", "B", "C"], + ), + ) + assert np.isclose( + hl.local_score(spbn, "D", ["A", "B", "C"]), + numpy_local_score( + pbn.LinearGaussianCPDType(), + hl.training_data().to_pandas(), + hl.test_data().to_pandas(), + "D", + ["B", "C", "A"], + ), + ) + + assert hl.local_score(spbn, "A") == hl.local_score(spbn, "A", spbn.parents("A")) + assert hl.local_score(spbn, "B") == hl.local_score(spbn, "B", spbn.parents("B")) + assert hl.local_score(spbn, "C") == hl.local_score(spbn, "C", spbn.parents("C")) + assert hl.local_score(spbn, "D") == hl.local_score(spbn, "D", spbn.parents("D")) + def test_holdout_local_score_null_spbn(): - spbn = pbn.SemiparametricBN([('a', 'b'), ('a', 'c'), ('a', 'd'), ('b', 'c'), ('b', 'd'), ('c', 'd')], - [('a', pbn.CKDEType()), ('c', pbn.CKDEType())]) - + spbn = pbn.SemiparametricBN( + [("A", "B"), ("A", "C"), ("A", "D"), ("B", "C"), ("B", "D"), ("C", "D")], + [("A", pbn.CKDEType()), ("C", pbn.CKDEType())], + ) + np.random.seed(0) a_null = np.random.randint(0, SIZE, size=100) b_null = np.random.randint(0, SIZE, size=100) @@ -153,45 +287,98 @@ def test_holdout_local_score_null_spbn(): d_null = np.random.randint(0, SIZE, size=100) df_null = df.copy() - df_null.loc[df_null.index[a_null], 'a'] = np.nan - df_null.loc[df_null.index[b_null], 'b'] = np.nan - df_null.loc[df_null.index[c_null], 'c'] = np.nan - df_null.loc[df_null.index[d_null], 'd'] = np.nan + df_null.loc[df_null.index[a_null], "A"] = np.nan + df_null.loc[df_null.index[b_null], "B"] = np.nan + df_null.loc[df_null.index[c_null], "C"] = np.nan + df_null.loc[df_null.index[d_null], "D"] = np.nan hl = pbn.HoldoutLikelihood(df_null, 0.2, seed) - assert np.isclose(hl.local_score(spbn, 'a', []), - numpy_local_score(pbn.CKDEType(), hl.training_data().to_pandas(), hl.test_data().to_pandas(), 'a', [])) - assert np.isclose(hl.local_score(spbn, 'b', ['a']), - numpy_local_score(pbn.LinearGaussianCPDType(), hl.training_data().to_pandas(), hl.test_data().to_pandas(), 'b', ['a'])) - assert np.isclose(hl.local_score(spbn, 'c', ['a', 'b']), - numpy_local_score(pbn.CKDEType(), hl.training_data().to_pandas(), hl.test_data().to_pandas(), 'c', ['a', 'b'])) - assert np.isclose(hl.local_score(spbn, 'd', ['a', 'b', 'c']), - numpy_local_score(pbn.LinearGaussianCPDType(), hl.training_data().to_pandas(), hl.test_data().to_pandas(), 'd', ['a', 'b', 'c'])) - assert np.isclose(hl.local_score(spbn, 'd', ['a', 'b', 'c']), - numpy_local_score(pbn.LinearGaussianCPDType(), hl.training_data().to_pandas(), hl.test_data().to_pandas(), 'd', ['b', 'c', 'a'])) - - assert hl.local_score(spbn, 'a') == hl.local_score(spbn, 'a', spbn.parents('a')) - assert hl.local_score(spbn, 'b') == hl.local_score(spbn, 'b', spbn.parents('b')) - 
assert hl.local_score(spbn, 'c') == hl.local_score(spbn, 'c', spbn.parents('c')) - assert hl.local_score(spbn, 'd') == hl.local_score(spbn, 'd', spbn.parents('d')) + assert np.isclose( + hl.local_score(spbn, "A", []), + numpy_local_score( + pbn.CKDEType(), + hl.training_data().to_pandas(), + hl.test_data().to_pandas(), + "A", + [], + ), + ) + assert np.isclose( + hl.local_score(spbn, "B", ["A"]), + numpy_local_score( + pbn.LinearGaussianCPDType(), + hl.training_data().to_pandas(), + hl.test_data().to_pandas(), + "B", + ["A"], + ), + ) + assert np.isclose( + hl.local_score(spbn, "C", ["A", "B"]), + numpy_local_score( + pbn.CKDEType(), + hl.training_data().to_pandas(), + hl.test_data().to_pandas(), + "C", + ["A", "B"], + ), + ) + assert np.isclose( + hl.local_score(spbn, "D", ["A", "B", "C"]), + numpy_local_score( + pbn.LinearGaussianCPDType(), + hl.training_data().to_pandas(), + hl.test_data().to_pandas(), + "D", + ["A", "B", "C"], + ), + ) + assert np.isclose( + hl.local_score(spbn, "D", ["A", "B", "C"]), + numpy_local_score( + pbn.LinearGaussianCPDType(), + hl.training_data().to_pandas(), + hl.test_data().to_pandas(), + "D", + ["B", "C", "A"], + ), + ) + + assert hl.local_score(spbn, "A") == hl.local_score(spbn, "A", spbn.parents("A")) + assert hl.local_score(spbn, "B") == hl.local_score(spbn, "B", spbn.parents("B")) + assert hl.local_score(spbn, "C") == hl.local_score(spbn, "C", spbn.parents("C")) + assert hl.local_score(spbn, "D") == hl.local_score(spbn, "D", spbn.parents("D")) + def test_holdout_score(): - gbn = pbn.GaussianNetwork([('a', 'b'), ('a', 'c'), ('a', 'd'), ('b', 'c'), ('b', 'd'), ('c', 'd')]) + gbn = pbn.GaussianNetwork( + [("A", "B"), ("A", "C"), ("A", "D"), ("B", "C"), ("B", "D"), ("C", "D")] + ) hl = pbn.HoldoutLikelihood(df, 0.2, 0) - assert np.isclose(hl.score(gbn), ( - hl.local_score(gbn, 'a', []) + - hl.local_score(gbn, 'b', ['a']) + - hl.local_score(gbn, 'c', ['a', 'b']) + - hl.local_score(gbn, 'd', ['a', 'b', 'c']))) + assert np.isclose( + hl.score(gbn), + ( + hl.local_score(gbn, "A", []) + + hl.local_score(gbn, "B", ["A"]) + + hl.local_score(gbn, "C", ["A", "B"]) + + hl.local_score(gbn, "D", ["A", "B", "C"]) + ), + ) - spbn = pbn.SemiparametricBN([('a', 'b'), ('a', 'c'), ('a', 'd'), ('b', 'c'), ('b', 'd'), ('c', 'd')], - [('a', pbn.CKDEType()), ('c', pbn.CKDEType())]) + spbn = pbn.SemiparametricBN( + [("A", "B"), ("A", "C"), ("A", "D"), ("B", "C"), ("B", "D"), ("C", "D")], + [("A", pbn.CKDEType()), ("C", pbn.CKDEType())], + ) - assert np.isclose(hl.score(spbn), ( - hl.local_score(spbn, 'a') + - hl.local_score(spbn, 'b') + - hl.local_score(spbn, 'c') + - hl.local_score(spbn, 'd'))) \ No newline at end of file + assert np.isclose( + hl.score(spbn), + ( + hl.local_score(spbn, "A") + + hl.local_score(spbn, "B") + + hl.local_score(spbn, "C") + + hl.local_score(spbn, "D") + ), + ) diff --git a/tests/models/BayesianNetwork_test.py b/tests/models/BayesianNetwork_test.py index e835245e..fec37dcf 100644 --- a/tests/models/BayesianNetwork_test.py +++ b/tests/models/BayesianNetwork_test.py @@ -1,225 +1,278 @@ -import pytest import numpy as np import pybnesian as pbn -from pybnesian import BayesianNetwork, GaussianNetwork -import util_test +import pytest +from helpers.data import DATA_SIZE, generate_normal_data + +df = generate_normal_data(DATA_SIZE) -df = util_test.generate_normal_data(10000) -def test_create_bn(): - gbn = GaussianNetwork(['a', 'b', 'c', 'd']) +def test_create_gaussian_bn(): + gbn = pbn.GaussianNetwork(["A", "B", "C", "D"]) assert gbn.num_nodes() == 4 assert 
gbn.num_arcs() == 0 - assert gbn.nodes() == ['a', 'b', 'c', 'd'] + assert gbn.nodes() == ["A", "B", "C", "D"] - gbn = GaussianNetwork(['a', 'b', 'c', 'd'], [('a', 'c')]) + gbn = pbn.GaussianNetwork(["A", "B", "C", "D"], [("A", "C")]) assert gbn.num_nodes() == 4 assert gbn.num_arcs() == 1 - assert gbn.nodes() == ['a', 'b', 'c', 'd'] + assert gbn.nodes() == ["A", "B", "C", "D"] - gbn = GaussianNetwork([('a', 'c'), ('b', 'd'), ('c', 'd')]) + gbn = pbn.GaussianNetwork([("A", "C"), ("B", "D"), ("C", "D")]) assert gbn.num_nodes() == 4 assert gbn.num_arcs() == 3 - assert gbn.nodes() == ['a', 'c', 'b', 'd'] + assert gbn.nodes() == ["A", "C", "B", "D"] + + with pytest.raises(TypeError) as ex: + gbn = pbn.GaussianNetwork(["A", "B", "C"], [("A", "C", "B")]) + assert "incompatible constructor arguments" in str(ex.value) + + with pytest.raises(IndexError) as ex: + gbn = pbn.GaussianNetwork(["A", "B", "C"], [("A", "D")]) + assert "not present in the graph" in str(ex.value) + + with pytest.raises(ValueError) as ex: + gbn = pbn.GaussianNetwork([("A", "B"), ("B", "C"), ("C", "A")]) + assert "must be a DAG" in str(ex.value) + + with pytest.raises(ValueError) as ex: + gbn = pbn.GaussianNetwork( + ["A", "B", "C", "D"], [("A", "B"), ("B", "C"), ("C", "A")] + ) + assert "must be a DAG" in str(ex.value) + + with pytest.raises(ValueError) as ex: + gbn = pbn.BayesianNetwork( + pbn.GaussianNetworkType(), ["A", "B", "C", "D"], [], [("A", pbn.CKDEType())] + ) + assert "Wrong factor type" in str(ex.value) + + +def test_create_discrete_bn(): + dbn = pbn.DiscreteBN(["A", "B", "C", "D"]) + + assert dbn.num_nodes() == 4 + assert dbn.num_arcs() == 0 + assert dbn.nodes() == ["A", "B", "C", "D"] + + dbn = pbn.DiscreteBN(["A", "B", "C", "D"], [("A", "C")]) + assert dbn.num_nodes() == 4 + assert dbn.num_arcs() == 1 + assert dbn.nodes() == ["A", "B", "C", "D"] + + dbn = pbn.DiscreteBN([("A", "C"), ("B", "D"), ("C", "D")]) + assert dbn.num_nodes() == 4 + assert dbn.num_arcs() == 3 + assert dbn.nodes() == ["A", "C", "B", "D"] with pytest.raises(TypeError) as ex: - gbn = GaussianNetwork(['a', 'b', 'c'], [('a', 'c', 'b')]) + dbn = pbn.DiscreteBN(["A", "B", "C"], [("A", "C", "B")]) assert "incompatible constructor arguments" in str(ex.value) - + with pytest.raises(IndexError) as ex: - gbn = GaussianNetwork(['a', 'b', 'c'], [('a', 'd')]) + dbn = pbn.DiscreteBN(["A", "B", "C"], [("A", "D")]) assert "not present in the graph" in str(ex.value) with pytest.raises(ValueError) as ex: - gbn = GaussianNetwork([('a', 'b'), ('b', 'c'), ('c', 'a')]) + dbn = pbn.DiscreteBN([("A", "B"), ("B", "C"), ("C", "A")]) assert "must be a DAG" in str(ex.value) with pytest.raises(ValueError) as ex: - gbn = GaussianNetwork(['a', 'b', 'c', 'd'], [('a', 'b'), ('b', 'c'), ('c', 'a')]) + dbn = pbn.DiscreteBN(["A", "B", "C", "D"], [("A", "B"), ("B", "C"), ("C", "A")]) assert "must be a DAG" in str(ex.value) with pytest.raises(ValueError) as ex: - gbn = BayesianNetwork(pbn.GaussianNetworkType(), ['a', 'b', 'c', 'd'], [], [('a', pbn.CKDEType())]) + dbn = pbn.BayesianNetwork( + pbn.DiscreteBNType(), + ["A", "B", "C", "D"], + [], + [("A", pbn.CKDEType())], + ) assert "Wrong factor type" in str(ex.value) - -def gbn_generator(): - # Test different Networks created with different constructors. 
- gbn = GaussianNetwork(['a', 'b', 'c', 'd']) - yield gbn - gbn = GaussianNetwork([('a', 'c'), ('b', 'd'), ('c', 'd')]) - yield gbn - gbn = GaussianNetwork(['a', 'b', 'c', 'd'], [('a', 'b'), ('b', 'c')]) - yield gbn + def test_nodes_util(): + def gbn_generator(): + # Test different Networks created with different constructors. + gbn = pbn.GaussianNetwork(["A", "B", "C", "D"]) + yield gbn + gbn = pbn.GaussianNetwork([("A", "C"), ("B", "D"), ("C", "D")]) + yield gbn + gbn = pbn.GaussianNetwork(["A", "B", "C", "D"], [("A", "B"), ("B", "C")]) + yield gbn + for gbn in gbn_generator(): assert gbn.num_nodes() == 4 nodes = gbn.nodes() indices = gbn.indices() - assert nodes[gbn.index('a')] == 'a' - assert nodes[gbn.index('b')] == 'b' - assert nodes[gbn.index('c')] == 'c' - assert nodes[gbn.index('d')] == 'd' + assert nodes[gbn.index("A")] == "A" + assert nodes[gbn.index("B")] == "B" + assert nodes[gbn.index("C")] == "C" + assert nodes[gbn.index("D")] == "D" assert indices[gbn.name(0)] == 0 assert indices[gbn.name(1)] == 1 assert indices[gbn.name(2)] == 2 assert indices[gbn.name(3)] == 3 - assert gbn.contains_node('a') - assert gbn.contains_node('b') - assert gbn.contains_node('c') - assert gbn.contains_node('d') - assert not gbn.contains_node('e') + assert gbn.contains_node("A") + assert gbn.contains_node("B") + assert gbn.contains_node("C") + assert gbn.contains_node("D") + assert not gbn.contains_node("E") + def test_parent_children(): - gbn = GaussianNetwork(['a', 'b', 'c', 'd']) - - assert gbn.num_parents('a') == 0 - assert gbn.num_parents('b') == 0 - assert gbn.num_parents('c') == 0 - assert gbn.num_parents('d') == 0 - - assert gbn.parents('a') == [] - assert gbn.parents('b') == [] - assert gbn.parents('c') == [] - assert gbn.parents('d') == [] - - assert gbn.num_children('a') == 0 - assert gbn.num_children('b') == 0 - assert gbn.num_children('c') == 0 - assert gbn.num_children('d') == 0 - - gbn = GaussianNetwork([('a', 'c'), ('b', 'd'), ('c', 'd')]) - - assert gbn.num_parents('a') == 0 - assert gbn.num_parents('b') == 0 - assert gbn.num_parents('c') == 1 - assert gbn.num_parents('d') == 2 - - assert gbn.parents('a') == [] - assert gbn.parents('b') == [] - assert gbn.parents('c') == ['a'] - assert set(gbn.parents('d')) == set(['b', 'c']) - - assert gbn.num_children('a') == 1 - assert gbn.num_children('b') == 1 - assert gbn.num_children('c') == 1 - assert gbn.num_children('d') == 0 - - gbn = GaussianNetwork(['a', 'b', 'c', 'd'], [('a', 'b'), ('b', 'c')]) - - assert gbn.num_parents('a') == 0 - assert gbn.num_parents('b') == 1 - assert gbn.num_parents('c') == 1 - assert gbn.num_parents('d') == 0 - - assert gbn.parents('a') == [] - assert gbn.parents('b') == ['a'] - assert gbn.parents('c') == ['b'] - assert gbn.parents('d') == [] - - assert gbn.num_children('a') == 1 - assert gbn.num_children('b') == 1 - assert gbn.num_children('c') == 0 - assert gbn.num_children('d') == 0 + gbn = pbn.GaussianNetwork(["A", "B", "C", "D"]) + + assert gbn.num_parents("A") == 0 + assert gbn.num_parents("B") == 0 + assert gbn.num_parents("C") == 0 + assert gbn.num_parents("D") == 0 + + assert gbn.parents("A") == [] + assert gbn.parents("B") == [] + assert gbn.parents("C") == [] + assert gbn.parents("D") == [] + + assert gbn.num_children("A") == 0 + assert gbn.num_children("B") == 0 + assert gbn.num_children("C") == 0 + assert gbn.num_children("D") == 0 + + gbn = pbn.GaussianNetwork([("A", "C"), ("B", "D"), ("C", "D")]) + + assert gbn.num_parents("A") == 0 + assert gbn.num_parents("B") == 0 + assert 
gbn.num_parents("C") == 1 + assert gbn.num_parents("D") == 2 + + assert gbn.parents("A") == [] + assert gbn.parents("B") == [] + assert gbn.parents("C") == ["A"] + assert set(gbn.parents("D")) == set(["B", "C"]) + + assert gbn.num_children("A") == 1 + assert gbn.num_children("B") == 1 + assert gbn.num_children("C") == 1 + assert gbn.num_children("D") == 0 + + gbn = pbn.GaussianNetwork(["A", "B", "C", "D"], [("A", "B"), ("B", "C")]) + + assert gbn.num_parents("A") == 0 + assert gbn.num_parents("B") == 1 + assert gbn.num_parents("C") == 1 + assert gbn.num_parents("D") == 0 + + assert gbn.parents("A") == [] + assert gbn.parents("B") == ["A"] + assert gbn.parents("C") == ["B"] + assert gbn.parents("D") == [] + + assert gbn.num_children("A") == 1 + assert gbn.num_children("B") == 1 + assert gbn.num_children("C") == 0 + assert gbn.num_children("D") == 0 + def test_arcs(): - gbn = GaussianNetwork(['a', 'b', 'c', 'd']) + gbn = pbn.GaussianNetwork(["A", "B", "C", "D"]) assert gbn.num_arcs() == 0 assert gbn.arcs() == [] - assert not gbn.has_arc('a', 'b') + assert not gbn.has_arc("A", "B") - gbn.add_arc('a', 'b') + gbn.add_arc("A", "B") assert gbn.num_arcs() == 1 - assert gbn.arcs() == [('a', 'b')] - assert gbn.parents('b') == ['a'] - assert gbn.num_parents('b') == 1 - assert gbn.num_children('a') == 1 - assert gbn.has_arc('a', 'b') + assert gbn.arcs() == [("A", "B")] + assert gbn.parents("B") == ["A"] + assert gbn.num_parents("B") == 1 + assert gbn.num_children("A") == 1 + assert gbn.has_arc("A", "B") - gbn.add_arc('b', 'c') + gbn.add_arc("B", "C") assert gbn.num_arcs() == 2 - assert set(gbn.arcs()) == set([('a', 'b'), ('b', 'c')]) - assert gbn.parents('c') == ['b'] - assert gbn.num_parents('c') == 1 - assert gbn.num_children('b') == 1 - assert gbn.has_arc('b', 'c') - - gbn.add_arc('d', 'c') + assert set(gbn.arcs()) == set([("A", "B"), ("B", "C")]) + assert gbn.parents("C") == ["B"] + assert gbn.num_parents("C") == 1 + assert gbn.num_children("B") == 1 + assert gbn.has_arc("B", "C") + + gbn.add_arc("D", "C") assert gbn.num_arcs() == 3 - assert set(gbn.arcs()) == set([('a', 'b'), ('b', 'c'), ('d', 'c')]) - assert set(gbn.parents('c')) == set(['b', 'd']) - assert gbn.num_parents('c') == 2 - assert gbn.num_children('d') == 1 - assert gbn.has_arc('d', 'c') - - assert gbn.has_path('a', 'c') - assert not gbn.has_path('a', 'd') - assert gbn.has_path('b', 'c') - assert gbn.has_path('d', 'c') - - assert not gbn.can_add_arc('c', 'a') - # This edge exists, but virtually we consider that the addition is allowed. - assert gbn.can_add_arc('b', 'c') - assert gbn.can_add_arc('d', 'a') - - gbn.add_arc('b', 'd') + assert set(gbn.arcs()) == set([("A", "B"), ("B", "C"), ("D", "C")]) + assert set(gbn.parents("C")) == set(["B", "D"]) + assert gbn.num_parents("C") == 2 + assert gbn.num_children("D") == 1 + assert gbn.has_arc("D", "C") + + assert gbn.has_path("A", "C") + assert not gbn.has_path("A", "D") + assert gbn.has_path("B", "C") + assert gbn.has_path("D", "C") + + assert not gbn.can_add_arc("C", "A") + # This edge exists, but virtually we consider that the addition is allowed. 
+ assert gbn.can_add_arc("B", "C") + assert gbn.can_add_arc("D", "A") + + gbn.add_arc("B", "D") assert gbn.num_arcs() == 4 - assert set(gbn.arcs()) == set([('a', 'b'), ('b', 'c'), ('d', 'c'), ('b', 'd')]) - assert gbn.parents('d') == ['b'] - assert gbn.num_parents('d') == 1 - assert gbn.num_children('b') == 2 - assert gbn.has_arc('b', 'd') - - assert gbn.has_path('a', 'd') - assert not gbn.can_add_arc('d', 'a') - assert not gbn.can_flip_arc('b', 'c') - assert gbn.can_flip_arc('a', 'b') + assert set(gbn.arcs()) == set([("A", "B"), ("B", "C"), ("D", "C"), ("B", "D")]) + assert gbn.parents("D") == ["B"] + assert gbn.num_parents("D") == 1 + assert gbn.num_children("B") == 2 + assert gbn.has_arc("B", "D") + + assert gbn.has_path("A", "D") + assert not gbn.can_add_arc("D", "A") + assert not gbn.can_flip_arc("B", "C") + assert gbn.can_flip_arc("A", "B") # This edge does not exist, but it could be flipped if it did. - assert gbn.can_flip_arc('d', 'a') + assert gbn.can_flip_arc("D", "A") # We can add an edge twice without changes. - gbn.add_arc('b', 'd') + gbn.add_arc("B", "D") assert gbn.num_arcs() == 4 - assert set(gbn.arcs()) == set([('a', 'b'), ('b', 'c'), ('d', 'c'), ('b', 'd')]) - assert gbn.parents('d') == ['b'] - assert gbn.num_parents('d') == 1 - assert gbn.num_children('b') == 2 - assert gbn.has_arc('b', 'd') + assert set(gbn.arcs()) == set([("A", "B"), ("B", "C"), ("D", "C"), ("B", "D")]) + assert gbn.parents("D") == ["B"] + assert gbn.num_parents("D") == 1 + assert gbn.num_children("B") == 2 + assert gbn.has_arc("B", "D") - gbn.remove_arc('b', 'c') + gbn.remove_arc("B", "C") assert gbn.num_arcs() == 3 - assert set(gbn.arcs()) == set([('a', 'b'), ('d', 'c'), ('b', 'd')]) - assert gbn.parents('c') == ['d'] - assert gbn.num_parents('c') == 1 - assert gbn.num_children('b') == 1 - assert not gbn.has_arc('b', 'c') - - assert gbn.can_add_arc('b', 'c') - assert not gbn.can_add_arc('c', 'b') - assert gbn.has_path('a', 'c') - assert gbn.has_path('b', 'c') - - gbn.remove_arc('d', 'c') + assert set(gbn.arcs()) == set([("A", "B"), ("D", "C"), ("B", "D")]) + assert gbn.parents("C") == ["D"] + assert gbn.num_parents("C") == 1 + assert gbn.num_children("B") == 1 + assert not gbn.has_arc("B", "C") + + assert gbn.can_add_arc("B", "C") + assert not gbn.can_add_arc("C", "B") + assert gbn.has_path("A", "C") + assert gbn.has_path("B", "C") + + gbn.remove_arc("D", "C") assert gbn.num_arcs() == 2 - assert set(gbn.arcs()) == set([('a', 'b'), ('b', 'd')]) - assert gbn.parents('c') == [] - assert gbn.num_parents('c') == 0 - assert gbn.num_children('d') == 0 - assert not gbn.has_arc('d', 'c') + assert set(gbn.arcs()) == set([("A", "B"), ("B", "D")]) + assert gbn.parents("C") == [] + assert gbn.num_parents("C") == 0 + assert gbn.num_children("D") == 0 + assert not gbn.has_arc("D", "C") + + assert gbn.can_add_arc("B", "C") + assert gbn.can_add_arc("C", "B") + assert not gbn.has_path("A", "C") + assert not gbn.has_path("B", "C") - assert gbn.can_add_arc('b', 'c') - assert gbn.can_add_arc('c', 'b') - assert not gbn.has_path('a', 'c') - assert not gbn.has_path('b', 'c') def test_bn_fit(): - gbn = GaussianNetwork([('a', 'b'), ('a', 'c'), ('a', 'd'), ('b', 'c'), ('b', 'd'), ('c', 'd')]) + gbn = pbn.GaussianNetwork( + [("A", "B"), ("A", "C"), ("A", "D"), ("B", "C"), ("B", "D"), ("C", "D")] + ) with pytest.raises(ValueError) as ex: for n in gbn.nodes(): @@ -234,105 +287,123 @@ def test_bn_fit(): assert cpd.evidence() == gbn.parents(n) gbn.fit(df) - - gbn.remove_arc('a', 'b') - cpd_b = gbn.cpd('b') - assert cpd_b.evidence 
!= gbn.parents('b')
+    gbn.remove_arc("A", "B")
+
+    cpd_b = gbn.cpd("B")
+    # Until fit() is called again, the stored CPD keeps the old parent set as evidence.
+    assert cpd_b.evidence() != gbn.parents("B")

     gbn.fit(df)
-    cpd_b = gbn.cpd('b')
-    assert cpd_b.evidence() == gbn.parents('b')
+    cpd_b = gbn.cpd("B")
+    assert cpd_b.evidence() == gbn.parents("B")
+

 def test_add_cpds():
-    gbn = GaussianNetwork([('a', 'b'), ('a', 'c'), ('a', 'd'), ('b', 'c'), ('b', 'd'), ('c', 'd')])
-
+    gbn = pbn.GaussianNetwork(
+        [("A", "B"), ("A", "C"), ("A", "D"), ("B", "C"), ("B", "D"), ("C", "D")]
+    )
+
     with pytest.raises(ValueError) as ex:
-        gbn.add_cpds([pbn.LinearGaussianCPD('e', [])])
+        gbn.add_cpds([pbn.LinearGaussianCPD("E", [])])
     assert "variable which is not present" in str(ex.value)

     with pytest.raises(ValueError) as ex:
-        gbn.add_cpds([pbn.LinearGaussianCPD('a', ['e'])])
+        gbn.add_cpds([pbn.LinearGaussianCPD("A", ["E"])])
     assert "Evidence variable" in str(ex.value)

     with pytest.raises(ValueError) as ex:
-        gbn.add_cpds([pbn.LinearGaussianCPD('a', ['b'])])
+        gbn.add_cpds([pbn.LinearGaussianCPD("A", ["B"])])
     assert "CPD do not have the model's parent set as evidence" in str(ex.value)

     with pytest.raises(ValueError) as ex:
-        gbn.add_cpds([pbn.LinearGaussianCPD('b', [])])
+        gbn.add_cpds([pbn.LinearGaussianCPD("B", [])])
     assert "CPD do not have the model's parent set as evidence" in str(ex.value)

     with pytest.raises(ValueError) as ex:
-        gbn.add_cpds([pbn.LinearGaussianCPD('b', ['c'])])
+        gbn.add_cpds([pbn.LinearGaussianCPD("B", ["C"])])
     assert "CPD do not have the model's parent set as evidence" in str(ex.value)

-    lg = pbn.LinearGaussianCPD('b', ['a'], [2.5, 1.65], 4)
+    lg = pbn.LinearGaussianCPD("B", ["A"], [2.5, 1.65], 4)
     assert lg.fitted()
     gbn.add_cpds([lg])

-    cpd_b = gbn.cpd('b')
-    assert cpd_b.variable() == 'b'
-    assert cpd_b.evidence() == ['a']
+    cpd_b = gbn.cpd("B")
+    assert cpd_b.variable() == "B"
+    assert cpd_b.evidence() == ["A"]
     assert cpd_b.fitted()
     assert np.all(cpd_b.beta == np.asarray([2.5, 1.65]))
     assert cpd_b.variance == 4

     with pytest.raises(ValueError) as ex:
-        cpd_a = gbn.cpd('a')
-    assert "CPD of variable \"a\" not added. Call add_cpds() or fit() to add the CPD." in str(ex.value)
+        gbn.cpd("A")
+    assert (
+        'CPD of variable "A" not added. Call add_cpds() or fit() to add the CPD.'
+        in str(ex.value)
+    )

     with pytest.raises(ValueError) as ex:
-        cpd_c = gbn.cpd('c')
-    assert "CPD of variable \"c\" not added. Call add_cpds() or fit() to add the CPD." in str(ex.value)
+        gbn.cpd("C")
+    assert (
+        'CPD of variable "C" not added. Call add_cpds() or fit() to add the CPD.'
+        in str(ex.value)
+    )

     with pytest.raises(ValueError) as ex:
-        cpd_d = gbn.cpd('d')
-    assert "CPD of variable \"d\" not added. Call add_cpds() or fit() to add the CPD." in str(ex.value)
+        gbn.cpd("D")
+    assert (
+        'CPD of variable "D" not added. Call add_cpds() or fit() to add the CPD.'
+ in str(ex.value) + ) with pytest.raises(ValueError) as ex: - gbn.add_cpds([pbn.LinearGaussianCPD('e', [])]) + gbn.add_cpds([pbn.LinearGaussianCPD("E", [])]) assert "variable which is not present" in str(ex.value) def test_bn_logl(): - gbn = GaussianNetwork([('a', 'b'), ('a', 'c'), ('a', 'd'), ('b', 'c'), ('b', 'd'), ('c', 'd')]) + gbn = pbn.GaussianNetwork( + [("A", "B"), ("A", "C"), ("A", "D"), ("B", "C"), ("B", "D"), ("C", "D")] + ) gbn.fit(df) - test_df = util_test.generate_normal_data(5000) + test_df = generate_normal_data(5000) ll = gbn.logl(test_df) sll = gbn.slogl(test_df) sum_ll = np.zeros((5000,)) sum_sll = 0 - + for n in gbn.nodes(): cpd = gbn.cpd(n) - l = cpd.logl(test_df) - s = cpd.slogl(test_df) - assert np.all(np.isclose(s, l.sum())) - sum_ll += l - sum_sll += s - + log_likelihood = cpd.logl(test_df) + sum_log_likelihood = cpd.slogl(test_df) + assert np.all(np.isclose(sum_log_likelihood, log_likelihood.sum())) + sum_ll += log_likelihood + sum_sll += sum_log_likelihood + assert np.all(np.isclose(ll, sum_ll)) assert np.isclose(sll, ll.sum()) assert sll == sum_sll + def test_bn_sample(): - gbn = GaussianNetwork(['a', 'c', 'b', 'd'], [('a', 'b'), ('a', 'c'), ('a', 'd'), ('b', 'c'), ('b', 'd'), ('c', 'd')]) + gbn = pbn.GaussianNetwork( + ["A", "C", "B", "D"], + [("A", "B"), ("A", "C"), ("A", "D"), ("B", "C"), ("B", "D"), ("C", "D")], + ) gbn.fit(df) sample = gbn.sample(1000, 0, False) # Not ordered, so topological sort. - assert sample.schema.names == ['a', 'b', 'c', 'd'] + assert sample.schema.names == ["A", "B", "C", "D"] assert sample.num_rows == 1000 - + sample_ordered = gbn.sample(1000, 0, True) - assert sample_ordered.schema.names == ['a', 'c', 'b', 'd'] + assert sample_ordered.schema.names == ["A", "C", "B", "D"] assert sample_ordered.num_rows == 1000 assert sample.column(0).equals(sample_ordered.column(0)) diff --git a/tests/models/BayesianNetwork_type_test.py b/tests/models/BayesianNetwork_type_test.py index 7f661ec2..4f454628 100644 --- a/tests/models/BayesianNetwork_type_test.py +++ b/tests/models/BayesianNetwork_type_test.py @@ -1,39 +1,38 @@ import pybnesian as pbn -from pybnesian import BayesianNetworkType, BayesianNetwork, ConditionalBayesianNetwork, GaussianNetwork,\ - SemiparametricBN, KDENetwork, DiscreteBN -import util_test +from helpers.data import generate_normal_data_independent + def test_bn_type(): - g1 = GaussianNetwork(["a", "b", "c", "d"]) - g2 = GaussianNetwork(["a", "b", "c", "d"]) - g3 = GaussianNetwork(["a", "b", "c", "d"]) + g1 = pbn.GaussianNetwork(["A", "B", "C", "D"]) + g2 = pbn.GaussianNetwork(["A", "B", "C", "D"]) + g3 = pbn.GaussianNetwork(["A", "B", "C", "D"]) assert g1.type() == pbn.GaussianNetworkType() assert g1.type() == g2.type() assert g1.type() == g3.type() assert g2.type() == g3.type() - s1 = SemiparametricBN(["a", "b", "c", "d"]) - s2 = SemiparametricBN(["a", "b", "c", "d"]) - s3 = SemiparametricBN(["a", "b", "c", "d"]) + s1 = pbn.SemiparametricBN(["A", "B", "C", "D"]) + s2 = pbn.SemiparametricBN(["A", "B", "C", "D"]) + s3 = pbn.SemiparametricBN(["A", "B", "C", "D"]) assert s1.type() == pbn.SemiparametricBNType() assert s1.type() == s2.type() assert s1.type() == s3.type() assert s2.type() == s3.type() - k1 = KDENetwork(["a", "b", "c", "d"]) - k2 = KDENetwork(["a", "b", "c", "d"]) - k3 = KDENetwork(["a", "b", "c", "d"]) + k1 = pbn.KDENetwork(["A", "B", "C", "D"]) + k2 = pbn.KDENetwork(["A", "B", "C", "D"]) + k3 = pbn.KDENetwork(["A", "B", "C", "D"]) assert k1.type() == pbn.KDENetworkType() assert k1.type() == k2.type() assert 
k1.type() == k3.type() assert k2.type() == k3.type() - d1 = DiscreteBN(["a", "b", "c", "d"]) - d2 = DiscreteBN(["a", "b", "c", "d"]) - d3 = DiscreteBN(["a", "b", "c", "d"]) + d1 = pbn.DiscreteBN(["A", "B", "C", "D"]) + d2 = pbn.DiscreteBN(["A", "B", "C", "D"]) + d3 = pbn.DiscreteBN(["A", "B", "C", "D"]) assert d1.type() == pbn.DiscreteBNType() assert d1.type() == d2.type() @@ -47,16 +46,17 @@ def test_bn_type(): assert s1.type() != d1.type() assert k1.type() != d1.type() + def test_new_bn_type(): - class MyGaussianNetworkType(BayesianNetworkType): + class MyGaussianNetworkType(pbn.BayesianNetworkType): def __init__(self): - BayesianNetworkType.__init__(self) + pbn.BayesianNetworkType.__init__(self) def is_homogeneous(self): return True def can_have_arc(self, model, source, target): - return source == "a" + return source == "A" a1 = MyGaussianNetworkType() a2 = MyGaussianNetworkType() @@ -66,10 +66,10 @@ def can_have_arc(self, model, source, target): assert a1 == a3 assert a2 == a3 - class MySemiparametricBNType(BayesianNetworkType): + class MySemiparametricBNType(pbn.BayesianNetworkType): def __init__(self): - BayesianNetworkType.__init__(self) - + pbn.BayesianNetworkType.__init__(self) + b1 = MySemiparametricBNType() b2 = MySemiparametricBNType() b3 = MySemiparametricBNType() @@ -80,17 +80,17 @@ def __init__(self): assert a1 != b1 - mybn = BayesianNetwork(a1, ["a", "b", "c", "d"]) + mybn = pbn.BayesianNetwork(a1, ["A", "B", "C", "D"]) - # This type omits the arcs that do not have "a" as source. - assert mybn.can_add_arc("a", "b") - assert not mybn.can_add_arc("b", "a") - assert not mybn.can_add_arc("c", "d") + # This type omits the arcs that do not have "A" as source. + assert mybn.can_add_arc("A", "B") + assert not mybn.can_add_arc("B", "A") + assert not mybn.can_add_arc("C", "D") -class MyRestrictedGaussianNetworkType(BayesianNetworkType): +class MyRestrictedGaussianNetworkType(pbn.BayesianNetworkType): def __init__(self): - BayesianNetworkType.__init__(self) + pbn.BayesianNetworkType.__init__(self) def is_homogeneous(self): return True @@ -99,73 +99,84 @@ def default_node_type(self): return pbn.LinearGaussianCPDType() def can_have_arc(self, model, source, target): - return source == "a" + return source == "A" def __str__(self): return "MyRestrictedGaussianNetworkType" -class SpecificNetwork(BayesianNetwork): + +class SpecificNetwork(pbn.BayesianNetwork): def __init__(self, variables, arcs=None): if arcs is None: - BayesianNetwork.__init__(self, MyRestrictedGaussianNetworkType(), variables) + pbn.BayesianNetwork.__init__( + self, MyRestrictedGaussianNetworkType(), variables + ) else: - BayesianNetwork.__init__(self, MyRestrictedGaussianNetworkType(), variables, arcs) + pbn.BayesianNetwork.__init__( + self, MyRestrictedGaussianNetworkType(), variables, arcs + ) -class ConditionalSpecificNetwork(ConditionalBayesianNetwork): + +class ConditionalSpecificNetwork(pbn.ConditionalBayesianNetwork): def __init__(self, variables, interface, arcs=None): if arcs is None: - ConditionalBayesianNetwork.__init__(self, MyRestrictedGaussianNetworkType(), variables, interface) + pbn.ConditionalBayesianNetwork.__init__( + self, MyRestrictedGaussianNetworkType(), variables, interface + ) else: - ConditionalBayesianNetwork.__init__(self, MyRestrictedGaussianNetworkType(), variables, interface, arcs) + pbn.ConditionalBayesianNetwork.__init__( + self, MyRestrictedGaussianNetworkType(), variables, interface, arcs + ) + def test_new_specific_bn_type(): - sp1 = SpecificNetwork(["a", "b", "c", "d"]) - sp2 = 
SpecificNetwork(["a", "b", "c", "d"], [("a", "b")]) - sp3 = SpecificNetwork(["a", "b", "c", "d"]) + sp1 = SpecificNetwork(["A", "B", "C", "D"]) + sp2 = SpecificNetwork(["A", "B", "C", "D"], [("A", "B")]) + sp3 = SpecificNetwork(["A", "B", "C", "D"]) assert sp1.type() == sp2.type() assert sp1.type() == sp3.type() assert sp2.type() == sp3.type() - assert sp1.can_add_arc("a", "b") - assert not sp1.can_add_arc("b", "a") - assert not sp1.can_add_arc("c", "d") + assert sp1.can_add_arc("A", "B") + assert not sp1.can_add_arc("B", "A") + assert not sp1.can_add_arc("C", "D") assert sp1.num_arcs() == sp3.num_arcs() == 0 - assert sp2.arcs() == [("a", "b")] + assert sp2.arcs() == [("A", "B")] - df = util_test.generate_normal_data_indep(1000) + df = generate_normal_data_independent(1000) bic = pbn.BIC(df) - start = SpecificNetwork(["a", "b", "c", "d"]) + start = SpecificNetwork(["A", "B", "C", "D"]) hc = pbn.GreedyHillClimbing() estimated = hc.estimate(pbn.ArcOperatorSet(), bic, start) assert estimated.type() == start.type() - assert all([s == "a" for s, t in estimated.arcs()]) + assert all([s == "A" for s, t in estimated.arcs()]) # ####################### # Conditional BN # ####################### - - csp1 = ConditionalSpecificNetwork(["a", "b"], ["c", "d"]) - csp2 = ConditionalSpecificNetwork(["a", "b"], ["c", "d"], [("a", "b")]) - csp3 = ConditionalSpecificNetwork(["a", "b"], ["c", "d"]) + + csp1 = ConditionalSpecificNetwork(["A", "B"], ["C", "D"]) + csp2 = ConditionalSpecificNetwork(["A", "B"], ["C", "D"], [("A", "B")]) + csp3 = ConditionalSpecificNetwork(["A", "B"], ["C", "D"]) assert csp1.type() == csp2.type() assert csp1.type() == csp3.type() assert csp2.type() == csp3.type() - assert csp1.can_add_arc("a", "b") - assert not csp1.can_add_arc("b", "a") - assert not csp1.can_add_arc("c", "d") + assert csp1.can_add_arc("A", "B") + assert not csp1.can_add_arc("B", "A") + assert not csp1.can_add_arc("C", "D") assert csp1.num_arcs() == csp3.num_arcs() == 0 - assert csp2.arcs() == [("a", "b")] + assert csp2.arcs() == [("A", "B")] - cstart = ConditionalSpecificNetwork(["a", "c"], ["b", "d"]) + cstart = ConditionalSpecificNetwork(["A", "C"], ["B", "D"]) hc = pbn.GreedyHillClimbing() cestimated = hc.estimate(pbn.ArcOperatorSet(), bic, cstart) assert cestimated.type() == cstart.type() - assert all([s == "a" for s, t in cestimated.arcs()]) + assert all([s == "A" for s, t in cestimated.arcs()]) diff --git a/tests/models/DynamicBayesianNetwork_test.py b/tests/models/DynamicBayesianNetwork_test.py index 892f96a2..bba9e5bf 100644 --- a/tests/models/DynamicBayesianNetwork_test.py +++ b/tests/models/DynamicBayesianNetwork_test.py @@ -1,20 +1,21 @@ -import pytest import re + import numpy as np import pandas as pd -from scipy.stats import norm import pybnesian as pbn -from pybnesian import GaussianNetwork, ConditionalGaussianNetwork, DynamicGaussianNetwork -import util_test +import pytest +from helpers.data import generate_normal_data +from scipy.stats import norm + +df = generate_normal_data(1000) -df = util_test.generate_normal_data(1000) def test_create_dbn(): - variables = ["a", "b", "c", "d"] - gbn = DynamicGaussianNetwork(variables, 2) + variables = ["A", "B", "C", "D"] + gbn = pbn.DynamicGaussianNetwork(variables, 2) assert gbn.markovian_order() == 2 - assert gbn.variables() == ["a", "b", "c", "d"] + assert gbn.variables() == ["A", "B", "C", "D"] assert gbn.num_variables() == 4 assert gbn.type() == pbn.GaussianNetworkType() @@ -25,52 +26,67 @@ def test_create_dbn(): assert 
set(gbn.transition_bn().interface_nodes()) == set(static_nodes) assert set(gbn.transition_bn().nodes()) == set(transition_nodes) - static_bn = GaussianNetwork(static_nodes) - transition_bn = ConditionalGaussianNetwork(transition_nodes, static_nodes) + static_bn = pbn.GaussianNetwork(static_nodes) + transition_bn = pbn.ConditionalGaussianNetwork(transition_nodes, static_nodes) - gbn2 = DynamicGaussianNetwork(variables, 2, static_bn, transition_bn) + gbn2 = pbn.DynamicGaussianNetwork(variables, 2, static_bn, transition_bn) + assert gbn2.markovian_order() == 2 + assert gbn2.variables() == ["A", "B", "C", "D"] + assert gbn2.num_variables() == 4 + assert gbn2.type() == pbn.GaussianNetworkType() wrong_transition_bn = pbn.ConditionalDiscreteBN(transition_nodes, static_nodes) with pytest.raises(ValueError) as ex: - gbn3 = DynamicGaussianNetwork(variables, 2, static_bn, wrong_transition_bn) - assert "Static and transition Bayesian networks do not have the same type" in str(ex.value) + pbn.DynamicGaussianNetwork(variables, 2, static_bn, wrong_transition_bn) + assert "Static and transition Bayesian networks do not have the same type" in str( + ex.value + ) wrong_static_bn = pbn.DiscreteBN(static_nodes) with pytest.raises(ValueError) as ex: - gbn4 = DynamicGaussianNetwork(variables, 2, wrong_static_bn, wrong_transition_bn) + pbn.DynamicGaussianNetwork(variables, 2, wrong_static_bn, wrong_transition_bn) assert "Bayesian networks are not Gaussian." in str(ex.value) + def test_variable_operations_dbn(): - variables = ["a", "b", "c", "d"] - gbn = DynamicGaussianNetwork(variables, 2) + variables = ["A", "B", "C", "D"] + gbn = pbn.DynamicGaussianNetwork(variables, 2) assert gbn.markovian_order() == 2 - assert gbn.variables() == ["a", "b", "c", "d"] + assert gbn.variables() == ["A", "B", "C", "D"] assert gbn.num_variables() == 4 - assert gbn.contains_variable("a") - assert gbn.contains_variable("b") - assert gbn.contains_variable("c") - assert gbn.contains_variable("d") + assert gbn.contains_variable("A") + assert gbn.contains_variable("B") + assert gbn.contains_variable("C") + assert gbn.contains_variable("D") - gbn.add_variable("e") - assert set(gbn.variables()) == set(["a", "b", "c", "d", "e"]) + gbn.add_variable("E") + assert set(gbn.variables()) == set(["A", "B", "C", "D", "E"]) assert gbn.num_variables() == 5 - assert set(gbn.static_bn().nodes()) == set([v + "_t_" + str(m) for v in variables + ["e"] for m in range(1, 3)]) - assert set(gbn.transition_bn().nodes()) == set([v + "_t_0" for v in variables + ["e"]]) + assert set(gbn.static_bn().nodes()) == set( + [v + "_t_" + str(m) for v in variables + ["E"] for m in range(1, 3)] + ) + assert set(gbn.transition_bn().nodes()) == set( + [v + "_t_0" for v in variables + ["E"]] + ) - gbn.remove_variable("b") - assert set(gbn.variables()) == set(["a", "c", "d", "e"]) + gbn.remove_variable("B") + assert set(gbn.variables()) == set(["A", "C", "D", "E"]) assert gbn.num_variables() == 4 - assert set(gbn.static_bn().nodes()) == set([v + "_t_" + str(m) for v in ["a", "c", "d", "e"] for m in range(1, 3)]) - assert set(gbn.transition_bn().nodes()) == set([v + "_t_0" for v in ["a", "c", "d", "e"]]) + assert set(gbn.static_bn().nodes()) == set( + [v + "_t_" + str(m) for v in ["A", "C", "D", "E"] for m in range(1, 3)] + ) + assert set(gbn.transition_bn().nodes()) == set( + [v + "_t_0" for v in ["A", "C", "D", "E"]] + ) def test_fit_dbn(): - variables = ["a", "b", "c", "d"] - gbn = DynamicGaussianNetwork(variables, 2) + variables = ["A", "B", "C", "D"] + gbn = 
pbn.DynamicGaussianNetwork(variables, 2)
     assert not gbn.fitted()
     assert not gbn.static_bn().fitted()
     assert not gbn.transition_bn().fitted()
@@ -78,7 +94,7 @@ def test_fit_dbn():
     assert gbn.fitted()

     ddf = pbn.DynamicDataFrame(df, 2)
-    gbn2 = DynamicGaussianNetwork(variables, 2)
+    gbn2 = pbn.DynamicGaussianNetwork(variables, 2)
     gbn2.static_bn().fit(ddf.static_df())
     assert not gbn2.fitted()
     assert gbn2.static_bn().fitted()
@@ -89,10 +105,12 @@
     assert gbn2.static_bn().fitted()
     assert gbn2.transition_bn().fitted()

+
 def lg_logl_row(row, variable, evidence, beta, variance):
     m = beta[0] + beta[1:].dot(row[evidence])
     return norm(m, np.sqrt(variance)).logpdf(row[variable])

+
 def static_logl(dbn, test_data, index, variable):
     sl = test_data.head(dbn.markovian_order())
@@ -102,16 +120,18 @@ def static_logl(dbn, test_data, index, variable):
     row_values = [sl.loc[index, variable]]

     for e in evidence:
-        m = re.search('(.*)_t_(\\d+)', e)
-        e_var = m[1]
-        t = int(m[2])
+        m = re.search("(.*)_t_(\\d+)", e)
+        # Evidence nodes are always named <variable>_t_<slice>; fail fast otherwise.
+        if m is None:
+            raise ValueError(f"Unexpected evidence node name: {e}")
+        e_var = m.group(1)
+        t = int(m.group(2))

-        row_values.append(sl.loc[dbn.markovian_order()-t, e_var])
+        row_values.append(sl.loc[dbn.markovian_order() - t, e_var])

     r = pd.Series(data=row_values, index=[node_name] + evidence)
     return lg_logl_row(r, node_name, evidence, cpd.beta, cpd.variance)

+
 def transition_logl(dbn, test_data, index, variable):
     node_name = variable + "_t_0"
     cpd = dbn.transition_bn().cpd(node_name)
@@ -119,11 +139,12 @@
     row_values = [test_data.loc[index, variable]]

     for e in evidence:
-        m = re.search('(.*)_t_(\\d+)', e)
-        e_var = m[1]
-        t = int(m[2])
+        m = re.search("(.*)_t_(\\d+)", e)
+        # Evidence nodes are always named <variable>_t_<slice>; fail fast otherwise.
+        if m is None:
+            raise ValueError(f"Unexpected evidence node name: {e}")
+        e_var = m.group(1)
+        t = int(m.group(2))

-        row_values.append(test_data.loc[index-t, e_var])
+        row_values.append(test_data.loc[index - t, e_var])

     r = pd.Series(data=row_values, index=[node_name] + evidence)
     return lg_logl_row(r, node_name, evidence, cpd.beta, cpd.variance)
@@ -142,64 +163,74 @@

     return ll

+
 def test_logl_dbn():
-    variables = ["a", "b", "c", "d"]
+    variables = ["A", "B", "C", "D"]

-    static_bn = GaussianNetwork(["a", "b", "c", "d"], [("a", "c"), ("b", "c"), ("c", "d")])
-    static_bn = GaussianNetwork(["a", "b", "c", "d"], [("a", "c"), ("b", "c"), ("c", "d")])
-    gbn = DynamicGaussianNetwork(variables, 2)
+    static_bn = pbn.GaussianNetwork(
+        ["A", "B", "C", "D"], [("A", "C"), ("B", "C"), ("C", "D")]
+    )
+    gbn = pbn.DynamicGaussianNetwork(variables, 2)

     static_bn = gbn.static_bn()
-    static_bn.add_arc("a_t_2", "c_t_2")
-    static_bn.add_arc("b_t_2", "c_t_2")
-    static_bn.add_arc("c_t_2", "d_t_2")
-    static_bn.add_arc("a_t_1", "c_t_1")
-    static_bn.add_arc("b_t_1", "c_t_1")
-    static_bn.add_arc("c_t_1", "d_t_1")
+    static_bn.add_arc("A_t_2", "C_t_2")
+    static_bn.add_arc("B_t_2", "C_t_2")
+    static_bn.add_arc("C_t_2", "D_t_2")
+    static_bn.add_arc("A_t_1", "C_t_1")
+    static_bn.add_arc("B_t_1", "C_t_1")
+    static_bn.add_arc("C_t_1", "D_t_1")

     transition_bn = gbn.transition_bn()
-    transition_bn.add_arc("a_t_2", "a_t_0")
-    transition_bn.add_arc("b_t_2", "b_t_0")
-    transition_bn.add_arc("c_t_2", "c_t_0")
-    transition_bn.add_arc("d_t_2", "d_t_0")
-    transition_bn.add_arc("a_t_1", "a_t_0")
-    transition_bn.add_arc("b_t_1", "b_t_0")
-    transition_bn.add_arc("c_t_1", "c_t_0")
-    transition_bn.add_arc("d_t_1", "d_t_0")
+    transition_bn.add_arc("A_t_2", "A_t_0")
+    transition_bn.add_arc("B_t_2", "B_t_0")
+    
transition_bn.add_arc("C_t_2", "C_t_0") + transition_bn.add_arc("D_t_2", "D_t_0") + transition_bn.add_arc("A_t_1", "A_t_0") + transition_bn.add_arc("B_t_1", "B_t_0") + transition_bn.add_arc("C_t_1", "C_t_0") + transition_bn.add_arc("D_t_1", "D_t_0") gbn.fit(df) - test_df = util_test.generate_normal_data(100) - ground_truth_ll = numpy_logl(gbn, util_test.generate_normal_data(100)) + test_df = generate_normal_data(100) + ground_truth_ll = numpy_logl(gbn, generate_normal_data(100)) ll = gbn.logl(test_df) assert np.all(np.isclose(ground_truth_ll, ll)) + def test_slogl_dbn(): - variables = ["a", "b", "c", "d"] + variables = ["A", "B", "C", "D"] - static_bn = GaussianNetwork(["a", "b", "c", "d"], [("a", "c"), ("b", "c"), ("c", "d")]) - static_bn = GaussianNetwork(["a", "b", "c", "d"], [("a", "c"), ("b", "c"), ("c", "d")]) - gbn = DynamicGaussianNetwork(variables, 2) + static_bn = pbn.GaussianNetwork( + ["A", "B", "C", "D"], [("A", "C"), ("B", "C"), ("C", "D")] + ) + static_bn = pbn.GaussianNetwork( + ["A", "B", "C", "D"], [("A", "C"), ("B", "C"), ("C", "D")] + ) + gbn = pbn.DynamicGaussianNetwork(variables, 2) static_bn = gbn.static_bn() - static_bn.add_arc("a_t_2", "c_t_2") - static_bn.add_arc("b_t_2", "c_t_2") - static_bn.add_arc("c_t_2", "d_t_2") - static_bn.add_arc("a_t_1", "c_t_1") - static_bn.add_arc("b_t_1", "c_t_1") - static_bn.add_arc("c_t_1", "d_t_1") + static_bn.add_arc("A_t_2", "C_t_2") + static_bn.add_arc("B_t_2", "C_t_2") + static_bn.add_arc("C_t_2", "D_t_2") + static_bn.add_arc("A_t_1", "C_t_1") + static_bn.add_arc("B_t_1", "C_t_1") + static_bn.add_arc("C_t_1", "D_t_1") transition_bn = gbn.transition_bn() - transition_bn.add_arc("a_t_2", "a_t_0") - transition_bn.add_arc("b_t_2", "b_t_0") - transition_bn.add_arc("c_t_2", "c_t_0") - transition_bn.add_arc("d_t_2", "d_t_0") - transition_bn.add_arc("a_t_1", "a_t_0") - transition_bn.add_arc("b_t_1", "b_t_0") - transition_bn.add_arc("c_t_1", "c_t_0") - transition_bn.add_arc("d_t_1", "d_t_0") + transition_bn.add_arc("A_t_2", "A_t_0") + transition_bn.add_arc("B_t_2", "B_t_0") + transition_bn.add_arc("C_t_2", "C_t_0") + transition_bn.add_arc("D_t_2", "D_t_0") + transition_bn.add_arc("A_t_1", "A_t_0") + transition_bn.add_arc("B_t_1", "B_t_0") + transition_bn.add_arc("C_t_1", "C_t_0") + transition_bn.add_arc("D_t_1", "D_t_0") gbn.fit(df) - test_df = util_test.generate_normal_data(100) + test_df = generate_normal_data(100) ll = numpy_logl(gbn, test_df) - assert np.isclose(gbn.slogl(test_df), ll.sum()) \ No newline at end of file + assert np.isclose(gbn.slogl(test_df), ll.sum()) diff --git a/tests/models/HeterogeneousBN_test.py b/tests/models/HeterogeneousBN_test.py index ca3614b7..72ea865e 100644 --- a/tests/models/HeterogeneousBN_test.py +++ b/tests/models/HeterogeneousBN_test.py @@ -1,55 +1,73 @@ -import pybnesian as pbn import pyarrow as pa +import pybnesian as pbn def test_type_equality(): - # + # # Test single vector types - # + # - het_single = pbn.HeterogeneousBN([pbn.CKDEType(), pbn.LinearGaussianCPDType()], ["a", "b", "c", "d"]) - het2_single = pbn.HeterogeneousBN([pbn.CKDEType(), pbn.LinearGaussianCPDType()], ["a", "b", "c", "d"]) + het_single = pbn.HeterogeneousBN( + [pbn.CKDEType(), pbn.LinearGaussianCPDType()], ["A", "B", "C", "D"] + ) + het2_single = pbn.HeterogeneousBN( + [pbn.CKDEType(), pbn.LinearGaussianCPDType()], ["A", "B", "C", "D"] + ) assert het_single.type() == het2_single.type() - het3_single = pbn.HeterogeneousBN([pbn.LinearGaussianCPDType(), pbn.CKDEType()], ["a", "b", "c", "d"]) - + het3_single = 
pbn.HeterogeneousBN( + [pbn.LinearGaussianCPDType(), pbn.CKDEType()], ["A", "B", "C", "D"] + ) + assert het_single.type() != het3_single.type() - # + # # Test a single vector type for each data type - # - - het_dt = pbn.HeterogeneousBN({ - pa.float64(): [pbn.CKDEType(), pbn.LinearGaussianCPDType()], - pa.float32(): [pbn.CKDEType(), pbn.LinearGaussianCPDType()], - pa.dictionary(pa.int8(), pa.string()): [pbn.DiscreteFactorType()] - }, ["a", "b", "c", "d"]) - - het2_dt = pbn.HeterogeneousBN({ - pa.dictionary(pa.int8(), pa.string()): [pbn.DiscreteFactorType()], - pa.float32(): [pbn.CKDEType(), pbn.LinearGaussianCPDType()], - pa.float64(): [pbn.CKDEType(), pbn.LinearGaussianCPDType()] - }, ["a", "b", "c", "d"]) - + # + + het_dt = pbn.HeterogeneousBN( + { + pa.float64(): [pbn.CKDEType(), pbn.LinearGaussianCPDType()], + pa.float32(): [pbn.CKDEType(), pbn.LinearGaussianCPDType()], + pa.dictionary(pa.int8(), pa.string()): [pbn.DiscreteFactorType()], + }, + ["A", "B", "C", "D"], + ) + + het2_dt = pbn.HeterogeneousBN( + { + pa.dictionary(pa.int8(), pa.string()): [pbn.DiscreteFactorType()], + pa.float32(): [pbn.CKDEType(), pbn.LinearGaussianCPDType()], + pa.float64(): [pbn.CKDEType(), pbn.LinearGaussianCPDType()], + }, + ["A", "B", "C", "D"], + ) + # The order of the set is not relevant assert het_dt.type() == het2_dt.type() - het3_dt = pbn.HeterogeneousBN({ - pa.dictionary(pa.int8(), pa.string()): [pbn.DiscreteFactorType()], - pa.float32(): [pbn.LinearGaussianCPDType(), pbn.CKDEType()], - pa.float64(): [pbn.CKDEType(), pbn.LinearGaussianCPDType()] - }, ["a", "b", "c", "d"]) + het3_dt = pbn.HeterogeneousBN( + { + pa.dictionary(pa.int8(), pa.string()): [pbn.DiscreteFactorType()], + pa.float32(): [pbn.LinearGaussianCPDType(), pbn.CKDEType()], + pa.float64(): [pbn.CKDEType(), pbn.LinearGaussianCPDType()], + }, + ["A", "B", "C", "D"], + ) # The order of the default FactorTypes is relevant assert het_dt.type() != het3_dt.type() - - # + + # # Compare single vector and multi vector FactorTypes - het_single = pbn.HeterogeneousBN([pbn.CKDEType(), pbn.LinearGaussianCPDType()], ["a", "b", "c", "d"]) - het_dt = pbn.HeterogeneousBN({ - pa.float64(): [pbn.CKDEType(), pbn.LinearGaussianCPDType()] - }, ["a", "b", "c", "d"]) + het_single = pbn.HeterogeneousBN( + [pbn.CKDEType(), pbn.LinearGaussianCPDType()], ["A", "B", "C", "D"] + ) + het_dt = pbn.HeterogeneousBN( + {pa.float64(): [pbn.CKDEType(), pbn.LinearGaussianCPDType()]}, + ["A", "B", "C", "D"], + ) assert het_single.type() != het_dt.type() diff --git a/tests/models/SemiparametricBN_test.py b/tests/models/SemiparametricBN_test.py index 8540b93f..5d79219d 100644 --- a/tests/models/SemiparametricBN_test.py +++ b/tests/models/SemiparametricBN_test.py @@ -1,117 +1,145 @@ -import pytest import numpy as np import pybnesian as pbn -from pybnesian import SemiparametricBN, LinearGaussianCPD, CKDE -import util_test +import pytest +from helpers.data import DATA_SIZE, generate_normal_data + +df = generate_normal_data(DATA_SIZE) -df = util_test.generate_normal_data(10000) def test_create_spbn(): - spbn = SemiparametricBN(['a', 'b', 'c', 'd']) + spbn = pbn.SemiparametricBN(["A", "B", "C", "D"]) assert spbn.num_nodes() == 4 assert spbn.num_arcs() == 0 - assert spbn.nodes() == ['a', 'b', 'c', 'd'] + assert spbn.nodes() == ["A", "B", "C", "D"] for n in spbn.nodes(): assert spbn.node_type(n) == pbn.UnknownFactorType() - spbn = SemiparametricBN(['a', 'b', 'c', 'd'], [('a', 'c')]) + spbn = pbn.SemiparametricBN(["A", "B", "C", "D"], [("A", "C")]) assert spbn.num_nodes() == 4 assert 
spbn.num_arcs() == 1 - assert spbn.nodes() == ['a', 'b', 'c', 'd'] + assert spbn.nodes() == ["A", "B", "C", "D"] for n in spbn.nodes(): assert spbn.node_type(n) == pbn.UnknownFactorType() - spbn = SemiparametricBN([('a', 'c'), ('b', 'd'), ('c', 'd')]) + spbn = pbn.SemiparametricBN([("A", "C"), ("B", "D"), ("C", "D")]) assert spbn.num_nodes() == 4 assert spbn.num_arcs() == 3 - assert spbn.nodes() == ['a', 'c', 'b', 'd'] + assert spbn.nodes() == ["A", "C", "B", "D"] for n in spbn.nodes(): assert spbn.node_type(n) == pbn.UnknownFactorType() with pytest.raises(TypeError) as ex: - spbn = SemiparametricBN(['a', 'b', 'c'], [('a', 'c', 'b')]) + spbn = pbn.SemiparametricBN(["A", "B", "C"], [("A", "C", "B")]) assert "incompatible constructor arguments" in str(ex.value) - + with pytest.raises(IndexError) as ex: - spbn = SemiparametricBN(['a', 'b', 'c'], [('a', 'd')]) + spbn = pbn.SemiparametricBN(["A", "B", "C"], [("A", "D")]) assert "not present in the graph" in str(ex.value) with pytest.raises(ValueError) as ex: - spbn = SemiparametricBN([('a', 'b'), ('b', 'c'), ('c', 'a')]) + spbn = pbn.SemiparametricBN([("A", "B"), ("B", "C"), ("C", "A")]) assert "must be a DAG" in str(ex.value) with pytest.raises(ValueError) as ex: - spbn = SemiparametricBN(['a', 'b', 'c', 'd'], [('a', 'b'), ('b', 'c'), ('c', 'a')]) + spbn = pbn.SemiparametricBN( + ["A", "B", "C", "D"], [("A", "B"), ("B", "C"), ("C", "A")] + ) assert "must be a DAG" in str(ex.value) + expected_node_type = { + "A": pbn.CKDEType(), + "B": pbn.UnknownFactorType(), + "C": pbn.CKDEType(), + "D": pbn.UnknownFactorType(), + } - expected_node_type = {'a': pbn.CKDEType(), - 'b': pbn.UnknownFactorType(), - 'c': pbn.CKDEType(), - 'd': pbn.UnknownFactorType()} - - spbn = SemiparametricBN(['a', 'b', 'c', 'd'], [('a', pbn.CKDEType()), ('c', pbn.CKDEType())]) + spbn = pbn.SemiparametricBN( + ["A", "B", "C", "D"], [("A", pbn.CKDEType()), ("C", pbn.CKDEType())] + ) assert spbn.num_nodes() == 4 assert spbn.num_arcs() == 0 - assert spbn.nodes() == ['a', 'b', 'c', 'd'] + assert spbn.nodes() == ["A", "B", "C", "D"] for n in spbn.nodes(): assert spbn.node_type(n) == expected_node_type[n] - spbn = SemiparametricBN(['a', 'b', 'c', 'd'], [('a', 'c')], [('a', pbn.CKDEType()), ('c', pbn.CKDEType())]) + spbn = pbn.SemiparametricBN( + ["A", "B", "C", "D"], + [("A", "C")], + [("A", pbn.CKDEType()), ("C", pbn.CKDEType())], + ) assert spbn.num_nodes() == 4 assert spbn.num_arcs() == 1 - assert spbn.nodes() == ['a', 'b', 'c', 'd'] + assert spbn.nodes() == ["A", "B", "C", "D"] for n in spbn.nodes(): assert spbn.node_type(n) == expected_node_type[n] - spbn = SemiparametricBN([('a', 'c'), ('b', 'd'), ('c', 'd')], [('a', pbn.CKDEType()), ('c', pbn.CKDEType())]) + spbn = pbn.SemiparametricBN( + [("A", "C"), ("B", "D"), ("C", "D")], + [("A", pbn.CKDEType()), ("C", pbn.CKDEType())], + ) assert spbn.num_nodes() == 4 assert spbn.num_arcs() == 3 - assert spbn.nodes() == ['a', 'c', 'b', 'd'] + assert spbn.nodes() == ["A", "C", "B", "D"] for n in spbn.nodes(): assert spbn.node_type(n) == expected_node_type[n] with pytest.raises(TypeError) as ex: - spbn = SemiparametricBN(['a', 'b', 'c'], [('a', 'c', 'b')], [('a', pbn.CKDEType()), ('c', pbn.CKDEType())]) + spbn = pbn.SemiparametricBN( + ["A", "B", "C"], + [("A", "C", "B")], + [("A", pbn.CKDEType()), ("C", pbn.CKDEType())], + ) assert "incompatible constructor arguments" in str(ex.value) - + with pytest.raises(IndexError) as ex: - spbn = SemiparametricBN(['a', 'b', 'c'], [('a', 'd')], [('a', pbn.CKDEType()), ('c', pbn.CKDEType())]) + spbn = 
pbn.SemiparametricBN(
+ ["A", "B", "C"],
+ [("A", "D")],
+ [("A", pbn.CKDEType()), ("C", pbn.CKDEType())],
+ )
assert "not present in the graph" in str(ex.value)
with pytest.raises(ValueError) as ex:
- spbn = SemiparametricBN([('a', 'b'), ('b', 'c'), ('c', 'a')], [('a', pbn.CKDEType()), ('c', pbn.CKDEType())])
+ spbn = pbn.SemiparametricBN(
+ [("A", "B"), ("B", "C"), ("C", "A")],
+ [("A", pbn.CKDEType()), ("C", pbn.CKDEType())],
+ )
assert "must be a DAG" in str(ex.value)
with pytest.raises(ValueError) as ex:
- spbn = SemiparametricBN(['a', 'b', 'c', 'd'],
- [('a', 'b'), ('b', 'c'), ('c', 'a')],
- [('a', pbn.CKDEType()), ('c', pbn.CKDEType())])
+ spbn = pbn.SemiparametricBN(
+ ["A", "B", "C", "D"],
+ [("A", "B"), ("B", "C"), ("C", "A")],
+ [("A", pbn.CKDEType()), ("C", pbn.CKDEType())],
+ )
assert "must be a DAG" in str(ex.value)

def test_node_type():
- spbn = SemiparametricBN(['a', 'b', 'c', 'd'])
+ spbn = pbn.SemiparametricBN(["A", "B", "C", "D"])
assert spbn.num_nodes() == 4
assert spbn.num_arcs() == 0
- assert spbn.nodes() == ['a', 'b', 'c', 'd']
+ assert spbn.nodes() == ["A", "B", "C", "D"]
for n in spbn.nodes():
assert spbn.node_type(n) == pbn.UnknownFactorType()
-
- spbn.set_node_type('b', pbn.CKDEType())
- assert spbn.node_type('b') == pbn.CKDEType()
- spbn.set_node_type('b', pbn.LinearGaussianCPDType())
- assert spbn.node_type('b') == pbn.LinearGaussianCPDType()
+
+ spbn.set_node_type("B", pbn.CKDEType())
+ assert spbn.node_type("B") == pbn.CKDEType()
+ spbn.set_node_type("B", pbn.LinearGaussianCPDType())
+ assert spbn.node_type("B") == pbn.LinearGaussianCPDType()
+

def test_fit():
- spbn = SemiparametricBN([('a', 'b'), ('a', 'c'), ('a', 'd'), ('b', 'c'), ('b', 'd'), ('c', 'd')])
+ spbn = pbn.SemiparametricBN(
+ [("A", "B"), ("A", "C"), ("A", "D"), ("B", "C"), ("B", "D"), ("C", "D")]
+ )
with pytest.raises(ValueError) as ex:
@@ -129,99 +157,115 @@ def test_fit():
assert set(cpd.evidence()) == set(spbn.parents(n))
spbn.fit(df)
-
- spbn.remove_arc('a', 'b')
- cpd_b = spbn.cpd('b')
+ spbn.remove_arc("A", "B")
+
+ cpd_b = spbn.cpd("B")
assert type(cpd_b) == pbn.LinearGaussianCPD
- assert cpd_b.evidence != spbn.parents('b')
+ assert cpd_b.evidence() != spbn.parents("B")
spbn.fit(df)
- cpd_b = spbn.cpd('b')
+ cpd_b = spbn.cpd("B")
assert type(cpd_b) == pbn.LinearGaussianCPD
- assert cpd_b.evidence() == spbn.parents('b')
+ assert cpd_b.evidence() == spbn.parents("B")
- spbn.set_node_type('c', pbn.CKDEType())
+ spbn.set_node_type("C", pbn.CKDEType())
with pytest.raises(ValueError) as ex:
- cpd_c = spbn.cpd('c')
+ cpd_c = spbn.cpd("C")
assert "not added" in str(ex.value)
spbn.fit(df)
- cpd_c = spbn.cpd('c')
- assert cpd_c.type() == spbn.node_type('c')
+ cpd_c = spbn.cpd("C")
+ assert cpd_c.type() == spbn.node_type("C")

def test_cpd():
- spbn = SemiparametricBN([('a', 'b'), ('a', 'c'), ('a', 'd'), ('b', 'c'), ('b', 'd'), ('c', 'd')], [('d', pbn.CKDEType())])
+ spbn = pbn.SemiparametricBN(
+ [("A", "B"), ("A", "C"), ("A", "D"), ("B", "C"), ("B", "D"), ("C", "D")],
+ [("D", pbn.CKDEType())],
+ )
with pytest.raises(ValueError) as ex:
- spbn.cpd('a')
+ spbn.cpd("A")
assert "not added" in str(ex.value)
spbn.fit(df)
- assert spbn.cpd('a').type() == pbn.LinearGaussianCPDType()
- assert spbn.cpd('b').type() == pbn.LinearGaussianCPDType()
- assert spbn.cpd('c').type() == pbn.LinearGaussianCPDType()
- assert spbn.cpd('d').type() == pbn.CKDEType()
+ assert spbn.cpd("A").type() == pbn.LinearGaussianCPDType()
+ assert spbn.cpd("B").type() == pbn.LinearGaussianCPDType()
+ assert
spbn.cpd("C").type() == pbn.LinearGaussianCPDType() + assert spbn.cpd("D").type() == pbn.CKDEType() + + assert spbn.cpd("A").fitted() + assert spbn.cpd("B").fitted() + assert spbn.cpd("C").fitted() + assert spbn.cpd("D").fitted() - assert spbn.cpd('a').fitted() - assert spbn.cpd('b').fitted() - assert spbn.cpd('c').fitted() - assert spbn.cpd('d').fitted() def test_add_cpds(): - spbn = SemiparametricBN([('a', 'b'), ('a', 'c'), ('a', 'd'), ('b', 'c'), ('b', 'd'), ('c', 'd')], [('d', pbn.CKDEType())]) + spbn = pbn.SemiparametricBN( + [("A", "B"), ("A", "C"), ("A", "D"), ("B", "C"), ("B", "D"), ("C", "D")], + [("D", pbn.CKDEType())], + ) - assert spbn.node_type('a') == pbn.UnknownFactorType() - spbn.add_cpds([CKDE('a', [])]) - assert spbn.node_type('a') == pbn.CKDEType() + assert spbn.node_type("A") == pbn.UnknownFactorType() + spbn.add_cpds([pbn.CKDE("A", [])]) + assert spbn.node_type("A") == pbn.CKDEType() with pytest.raises(ValueError) as ex: - spbn.add_cpds([LinearGaussianCPD('d', ['a', 'b', 'c'])]) + spbn.add_cpds([pbn.LinearGaussianCPD("D", ["A", "B", "C"])]) assert "Bayesian network expects type" in str(ex.value) - lg = LinearGaussianCPD('b', ['a'], [2.5, 1.65], 4) - ckde = CKDE('d', ['a', 'b', 'c']) + lg = pbn.LinearGaussianCPD("B", ["A"], [2.5, 1.65], 4) + ckde = pbn.CKDE("D", ["A", "B", "C"]) assert lg.fitted() assert not ckde.fitted() spbn.add_cpds([lg, ckde]) - spbn.set_node_type('a', pbn.UnknownFactorType()) + spbn.set_node_type("A", pbn.UnknownFactorType()) with pytest.raises(ValueError) as ex: - not spbn.cpd('a').fitted() - assert "CPD of variable \"a\" not added. Call add_cpds() or fit() to add the CPD." in str(ex.value) + spbn.cpd("A").fitted() + assert ( + 'CPD of variable "A" not added. Call add_cpds() or fit() to add the CPD.' + in str(ex.value) + ) - assert spbn.cpd('b').fitted() + assert spbn.cpd("B").fitted() with pytest.raises(ValueError) as ex: - not spbn.cpd('c').fitted() - assert "CPD of variable \"c\" not added. Call add_cpds() or fit() to add the CPD." in str(ex.value) + spbn.cpd("C").fitted() + assert ( + 'CPD of variable "C" not added. Call add_cpds() or fit() to add the CPD.' 
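# ---------------------------------------------------------------------------
# Sketch of the add_cpds() contract test_add_cpds() exercises here (assumes
# only the API used in this file): adding a CPD to a node that still has
# UnknownFactorType fixes the node type, while a CPD that contradicts an
# explicit node type is rejected with the error message asserted above.
import pybnesian as pbn

spbn = pbn.SemiparametricBN(["A", "B"], [("A", "B")], [("B", pbn.CKDEType())])
spbn.add_cpds([pbn.CKDE("A", [])])  # "A" becomes CKDEType
assert spbn.node_type("A") == pbn.CKDEType()
try:
    spbn.add_cpds([pbn.LinearGaussianCPD("B", ["A"])])  # "B" expects CKDEType
except ValueError as e:
    assert "Bayesian network expects type" in str(e)
# ---------------------------------------------------------------------------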
+ in str(ex.value) + ) + + assert not spbn.cpd("D").fitted() - assert not spbn.cpd('d').fitted() def test_logl(): - spbn = SemiparametricBN([('a', 'b'), ('a', 'c'), ('a', 'd'), ('b', 'c'), ('b', 'd'), ('c', 'd')]) + spbn = pbn.SemiparametricBN( + [("A", "B"), ("A", "C"), ("A", "D"), ("B", "C"), ("B", "D"), ("C", "D")] + ) spbn.fit(df) - test_df = util_test.generate_normal_data(5000) + test_df = generate_normal_data(5000) ll = spbn.logl(test_df) sll = spbn.slogl(test_df) sum_ll = np.zeros((5000,)) sum_sll = 0 - + for n in spbn.nodes(): cpd = spbn.cpd(n) - l = cpd.logl(test_df) - s = cpd.slogl(test_df) - assert np.all(np.isclose(s, l.sum())) - sum_ll += l - sum_sll += s - + log_likelihood = cpd.logl(test_df) + sum_log_likelihood = cpd.slogl(test_df) + assert np.all(np.isclose(sum_log_likelihood, log_likelihood.sum())) + sum_ll += log_likelihood + sum_sll += sum_log_likelihood + assert np.all(np.isclose(ll, sum_ll)) assert np.isclose(sll, ll.sum()) - assert sll == sum_sll \ No newline at end of file + assert sll == sum_sll diff --git a/tests/serialization/serialize_factor_test.py b/tests/serialization/serialize_factor_test.py index f6e34256..2e7b5481 100644 --- a/tests/serialization/serialize_factor_test.py +++ b/tests/serialization/serialize_factor_test.py @@ -1,28 +1,32 @@ +import pickle + import numpy as np import pandas as pd -import pytest import pybnesian as pbn -from pybnesian import FactorType, Factor, LinearGaussianCPD, CKDE, DiscreteFactor -import pickle +import pytest + @pytest.fixture def lg_bytes(): - lg = LinearGaussianCPD("c", ["a", "b"]) + lg = pbn.LinearGaussianCPD("C", ["A", "B"]) return pickle.dumps(lg) + @pytest.fixture def ckde_bytes(): - ckde = CKDE("c", ["a", "b"]) + ckde = pbn.CKDE("C", ["A", "B"]) return pickle.dumps(ckde) + @pytest.fixture def discrete_bytes(): - discrete = DiscreteFactor("c", ["a", "b"]) + discrete = pbn.DiscreteFactor("C", ["A", "B"]) return pickle.dumps(discrete) -class NewType(FactorType): + +class NewType(pbn.FactorType): def __init__(self, factor_class): - FactorType.__init__(self) + pbn.FactorType.__init__(self) self.factor_class = factor_class def new_factor(self, model, variable, evidence): @@ -31,16 +35,17 @@ def new_factor(self, model, variable, evidence): def __str__(self): return "NewType" -class NewFactor(Factor): + +class NewFactor(pbn.Factor): def __init__(self, variable, evidence): - Factor.__init__(self, variable, evidence) + pbn.Factor.__init__(self, variable, evidence) self._fitted = False self.some_fit_data = None - + def fit(self, df): self.some_fit_data = "fitted" self._fitted = True - + def fitted(self): return self._fitted @@ -51,23 +56,24 @@ def __str__(self): return "NewFactor" def __getstate_extra__(self): - d = {'fitted': self._fitted, 'some_fit_data': self.some_fit_data} + d = {"fitted": self._fitted, "some_fit_data": self.some_fit_data} return d def __setstate_extra__(self, d): - self._fitted = d['fitted'] - self.some_fit_data = d['some_fit_data'] + self._fitted = d["fitted"] + self.some_fit_data = d["some_fit_data"] + -class NewFactorBis(Factor): +class NewFactorBis(pbn.Factor): def __init__(self, variable, evidence): - Factor.__init__(self, variable, evidence) + pbn.Factor.__init__(self, variable, evidence) self._fitted = False self.some_fit_data = None def fit(self, df): self.some_fit_data = "fitted" self._fitted = True - + def fitted(self): return self._fitted @@ -78,66 +84,72 @@ def __str__(self): return "NewFactor" def __getstate__(self): - d = {'variable': self.variable(), - 'evidence': self.evidence(), - 
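# ---------------------------------------------------------------------------
# The decomposition test_logl() above verifies, written out as a helper
# (a sketch; assumes a fitted network and a DataFrame, as in that test): the
# joint log-likelihood factorizes over the nodes, so the per-CPD logl values
# sum to the network logl, and slogl() equals logl().sum().
import numpy as np

def manual_logl(bn, data):
    total = np.zeros(len(data))
    for n in bn.nodes():
        total += bn.cpd(n).logl(data)  # per-node contribution
    return total

# Expected: np.allclose(manual_logl(spbn, test_df), spbn.logl(test_df)) and
# np.isclose(spbn.slogl(test_df), spbn.logl(test_df).sum()).
# ---------------------------------------------------------------------------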
'fitted': self._fitted, - 'some_fit_data': self.some_fit_data} + d = { + "variable": self.variable(), + "evidence": self.evidence(), + "fitted": self._fitted, + "some_fit_data": self.some_fit_data, + } return d def __setstate__(self, d): - Factor.__init__(self, d['variable'], d['evidence']) - self._fitted = d['fitted'] - self.some_fit_data = d['some_fit_data'] + pbn.Factor.__init__(self, d["variable"], d["evidence"]) + self._fitted = d["fitted"] + self.some_fit_data = d["some_fit_data"] + @pytest.fixture def new_bytes(): - n = NewFactor("c", ["a", "b"]) + n = NewFactor("C", ["A", "B"]) return pickle.dumps(n) + @pytest.fixture def newbis_bytes(): - n = NewFactorBis("c", ["a", "b"]) + n = NewFactorBis("C", ["A", "B"]) return pickle.dumps(n) -def test_serialization_unfitted_factor(lg_bytes, ckde_bytes, discrete_bytes, new_bytes, newbis_bytes): + +def test_serialization_unfitted_factor( + lg_bytes, ckde_bytes, discrete_bytes, new_bytes, newbis_bytes +): loaded_lg = pickle.loads(lg_bytes) - assert loaded_lg.variable() == "c" - assert set(loaded_lg.evidence()) == set(["a", "b"]) + assert loaded_lg.variable() == "C" + assert set(loaded_lg.evidence()) == set(["A", "B"]) assert not loaded_lg.fitted() assert loaded_lg.type() == pbn.LinearGaussianCPDType() loaded_ckde = pickle.loads(ckde_bytes) - assert loaded_ckde.variable() == "c" - assert set(loaded_ckde.evidence()) == set(["a", "b"]) + assert loaded_ckde.variable() == "C" + assert set(loaded_ckde.evidence()) == set(["A", "B"]) assert not loaded_ckde.fitted() assert loaded_ckde.type() == pbn.CKDEType() loaded_discrete = pickle.loads(discrete_bytes) - assert loaded_discrete.variable() == "c" - assert set(loaded_discrete.evidence()) == set(["a", "b"]) + assert loaded_discrete.variable() == "C" + assert set(loaded_discrete.evidence()) == set(["A", "B"]) assert not loaded_discrete.fitted() assert loaded_discrete.type() == pbn.DiscreteFactorType() loaded_new = pickle.loads(new_bytes) - assert loaded_new.variable() == "c" - assert set(loaded_new.evidence()) == set(["a", "b"]) + assert loaded_new.variable() == "C" + assert set(loaded_new.evidence()) == set(["A", "B"]) assert not loaded_new.fitted() assert type(loaded_new.type()) == NewType - nn = NewFactor("a", []) + nn = NewFactor("A", []) assert loaded_new.type() == nn.type() - from pybnesian import GaussianNetwork - dummy_network = GaussianNetwork(["a", "b", "c", "d"]) - assert type(loaded_new.type().new_factor(dummy_network, "a", [])) == NewFactor + dummy_network = pbn.GaussianNetwork(["A", "B", "C", "D"]) + assert type(loaded_new.type().new_factor(dummy_network, "A", [])) == NewFactor loaded_newbis = pickle.loads(newbis_bytes) - assert loaded_newbis.variable() == "c" - assert set(loaded_newbis.evidence()) == set(["a", "b"]) + assert loaded_newbis.variable() == "C" + assert set(loaded_newbis.evidence()) == set(["A", "B"]) assert not loaded_newbis.fitted() assert type(loaded_newbis.type()) == NewType - nnbis = NewFactorBis("a", []) + nnbis = NewFactorBis("A", []) assert loaded_newbis.type() == nnbis.type() - assert type(loaded_newbis.type().new_factor(dummy_network, "a", [])) == NewFactorBis + assert type(loaded_newbis.type().new_factor(dummy_network, "A", [])) == NewFactorBis assert loaded_lg.type() != loaded_ckde.type() assert loaded_lg.type() != loaded_discrete.type() @@ -147,96 +159,119 @@ def test_serialization_unfitted_factor(lg_bytes, ckde_bytes, discrete_bytes, new assert loaded_discrete.type() != loaded_new.type() assert loaded_newbis.type() == loaded_new.type() + @pytest.fixture def 
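# ---------------------------------------------------------------------------
# Sketch contrasting the two serialization routes used by NewFactor and
# NewFactorBis above (semantics as I understand pybnesian's pickling support,
# so treat it as an assumption): __getstate_extra__/__setstate_extra__ let
# the base Factor state be pickled automatically with your extras appended,
# while overriding __getstate__/__setstate__ makes you responsible for
# restoring the variable and evidence yourself.
import pickle

f = NewFactor("C", ["A", "B"])  # uses the *_extra__ hooks defined above
f.fit(None)
g = pickle.loads(pickle.dumps(f))
assert g.variable() == "C"
assert g.fitted() and g.some_fit_data == "fitted"
# ---------------------------------------------------------------------------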
lg_fitted_bytes(): - lg = LinearGaussianCPD("c", ["a", "b"], [1, 2, 3], 0.5) + lg = pbn.LinearGaussianCPD("C", ["A", "B"], [1, 2, 3], 0.5) return pickle.dumps(lg) + @pytest.fixture def ckde_fitted_bytes(): np.random.seed(1) - data = pd.DataFrame({'a': np.random.rand(10), 'b': np.random.rand(10), 'c': np.random.rand(10)}).astype(float) - ckde = CKDE("c", ["a", "b"]) + data = pd.DataFrame( + {"A": np.random.rand(10), "B": np.random.rand(10), "C": np.random.rand(10)} + ).astype(float) + ckde = pbn.CKDE("C", ["A", "B"]) ckde.fit(data) return pickle.dumps(ckde) + @pytest.fixture def discrete_fitted_bytes(): - discrete = DiscreteFactor("c", ["a", "b"]) - - data = pd.DataFrame({'a': ["a1", "a2", "a1", "a2", "a2", "a2", "a2", "a2"], - 'b': ["b1", "b1", "b1", "b1", "b1", "b2", "b1", "b2"], - 'c': ["c1", "c1", "c1", "c1", "c2", "c2", "c2", "c2"]}, dtype="category") + discrete = pbn.DiscreteFactor("C", ["A", "B"]) + + data = pd.DataFrame( + { + "A": ["A1", "A2", "A1", "A2", "A2", "A2", "A2", "A2"], + "B": ["B1", "B1", "B1", "B1", "B1", "B2", "B1", "B2"], + "C": ["C1", "C1", "C1", "C1", "C2", "C2", "C2", "C2"], + }, + dtype="category", + ) discrete.fit(data) return pickle.dumps(discrete) + @pytest.fixture def new_fitted_bytes(): - n = NewFactor("c", ["a", "b"]) + n = NewFactor("C", ["A", "B"]) n.fit(None) return pickle.dumps(n) + @pytest.fixture def newbis_fitted_bytes(): - n = NewFactorBis("c", ["a", "b"]) + n = NewFactorBis("C", ["A", "B"]) n.fit(None) return pickle.dumps(n) -def test_serialization_fitted_factor(lg_fitted_bytes, ckde_fitted_bytes, discrete_fitted_bytes, new_fitted_bytes, - newbis_fitted_bytes): + +def test_serialization_fitted_factor( + lg_fitted_bytes, + ckde_fitted_bytes, + discrete_fitted_bytes, + new_fitted_bytes, + newbis_fitted_bytes, +): loaded_lg = pickle.loads(lg_fitted_bytes) - assert loaded_lg.variable() == "c" - assert set(loaded_lg.evidence()) == set(["a", "b"]) + assert loaded_lg.variable() == "C" + assert set(loaded_lg.evidence()) == set(["A", "B"]) assert loaded_lg.fitted() assert list(loaded_lg.beta) == [1, 2, 3] assert loaded_lg.variance == 0.5 loaded_ckde = pickle.loads(ckde_fitted_bytes) - assert loaded_ckde.variable() == "c" - assert set(loaded_ckde.evidence()) == set(["a", "b"]) + assert loaded_ckde.variable() == "C" + assert set(loaded_ckde.evidence()) == set(["A", "B"]) assert loaded_ckde.fitted() assert loaded_ckde.type() == pbn.CKDEType() assert loaded_ckde.num_instances() == 10 tr = loaded_ckde.kde_joint().dataset().to_pandas() np.random.seed(1) - assert np.all(tr['a'] == np.random.rand(10)) - assert np.all(tr['b'] == np.random.rand(10)) - assert np.all(tr['c'] == np.random.rand(10)) + assert np.all(tr["A"] == np.random.rand(10)) + assert np.all(tr["B"] == np.random.rand(10)) + assert np.all(tr["C"] == np.random.rand(10)) loaded_discrete = pickle.loads(discrete_fitted_bytes) - assert loaded_discrete.variable() == "c" - assert set(loaded_discrete.evidence()) == set(["a", "b"]) + assert loaded_discrete.variable() == "C" + assert set(loaded_discrete.evidence()) == set(["A", "B"]) assert loaded_discrete.fitted() assert loaded_discrete.type() == pbn.DiscreteFactorType() - test = pd.DataFrame({'a': ["a1", "a2", "a1", "a2", "a1", "a2", "a1", "a2"], - 'b': ["b1", "b1", "b2", "b2", "b1", "b1", "b2", "b2"], - 'c': ["c1", "c1", "c1", "c1", "c2", "c2", "c2", "c2"]}, dtype="category") + test = pd.DataFrame( + { + "A": ["A1", "A2", "A1", "A2", "A1", "A2", "A1", "A2"], + "B": ["B1", "B1", "B2", "B2", "B1", "B1", "B2", "B2"], + "C": ["C1", "C1", "C1", "C1", "C2", "C2", 
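# ---------------------------------------------------------------------------
# Sketch of what the fitted-CKDE fixture and its assertions check (assumes
# only the API used in this file): pickling a fitted CKDE preserves the joint
# KDE's training dataset, so regenerating the same random data matches it.
import pickle
import numpy as np
import pandas as pd
import pybnesian as pbn

np.random.seed(1)
data = pd.DataFrame(
    {"A": np.random.rand(10), "B": np.random.rand(10), "C": np.random.rand(10)}
).astype(float)
ckde = pbn.CKDE("C", ["A", "B"])
ckde.fit(data)
restored = pickle.loads(pickle.dumps(ckde))
assert restored.num_instances() == 10
tr = restored.kde_joint().dataset().to_pandas()
np.random.seed(1)
assert np.all(tr["A"] == np.random.rand(10))
# ---------------------------------------------------------------------------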
"C2", "C2"], + }, + dtype="category", + ) ll = loaded_discrete.logl(test) assert list(np.exp(ll)) == [1, 0.5, 0.5, 0, 0, 0.5, 0.5, 1] loaded_new = pickle.loads(new_fitted_bytes) - assert loaded_new.variable() == "c" - assert set(loaded_new.evidence()) == set(["a", "b"]) + assert loaded_new.variable() == "C" + assert set(loaded_new.evidence()) == set(["A", "B"]) assert loaded_new.fitted() assert type(loaded_new.type()) == NewType - nn = NewFactor("a", []) + nn = NewFactor("A", []) assert loaded_new.type() == nn.type() assert loaded_new.some_fit_data == "fitted" loaded_newbis = pickle.loads(newbis_fitted_bytes) - assert loaded_newbis.variable() == "c" - assert set(loaded_newbis.evidence()) == set(["a", "b"]) + assert loaded_newbis.variable() == "C" + assert set(loaded_newbis.evidence()) == set(["A", "B"]) assert loaded_newbis.fitted() - assert type(loaded_newbis.type()) == NewType - nn = NewFactorBis("a", []) + assert isinstance(loaded_newbis.type(), NewType) + nn = NewFactorBis("A", []) assert loaded_newbis.type() == nn.type() assert loaded_newbis.some_fit_data == "fitted" - assert type(loaded_newbis.type()) == type(loaded_new.type()) + assert isinstance(loaded_newbis.type(), type(loaded_new.type())) assert loaded_lg.type() != loaded_ckde.type() assert loaded_lg.type() != loaded_discrete.type() assert loaded_lg.type() != loaded_new.type() assert loaded_ckde.type() != loaded_discrete.type() assert loaded_ckde.type() != loaded_new.type() - assert loaded_discrete.type() != loaded_new.type() \ No newline at end of file + assert loaded_discrete.type() != loaded_new.type() diff --git a/tests/serialization/serialize_factor_type_test.py b/tests/serialization/serialize_factor_type_test.py index 67f9480c..7377ceae 100644 --- a/tests/serialization/serialize_factor_type_test.py +++ b/tests/serialization/serialize_factor_type_test.py @@ -1,42 +1,56 @@ -import pytest -import pybnesian as pbn -from pybnesian import FactorType import pickle +import pybnesian as pbn +import pytest + + @pytest.fixture def lg_type_bytes(): lg = pbn.LinearGaussianCPDType() return pickle.dumps(lg) + @pytest.fixture def ckde_type_bytes(): ckde = pbn.CKDEType() return pickle.dumps(ckde) + @pytest.fixture def discrete_type_bytes(): discrete = pbn.DiscreteFactorType() return pickle.dumps(discrete) -class NewType(FactorType): + +class NewType(pbn.FactorType): def __init__(self): - FactorType.__init__(self) + pbn.FactorType.__init__(self) + -class OtherType(FactorType): +class OtherType(pbn.FactorType): def __init__(self): - FactorType.__init__(self) + pbn.FactorType.__init__(self) + @pytest.fixture def new_type_bytes(): n = NewType() return pickle.dumps(n) + @pytest.fixture def other_type_bytes(): o = OtherType() return pickle.dumps(o) -def test_serialization_factor_type(lg_type_bytes, ckde_type_bytes, discrete_type_bytes, new_type_bytes, other_type_bytes): + +def test_serialization_factor_type( + lg_type_bytes, + ckde_type_bytes, + discrete_type_bytes, + new_type_bytes, + other_type_bytes, +): loaded_lg = pickle.loads(lg_type_bytes) new_lg = pbn.LinearGaussianCPDType() assert new_lg == loaded_lg @@ -66,4 +80,4 @@ def test_serialization_factor_type(lg_type_bytes, ckde_type_bytes, discrete_type assert new_ckde != new_other assert new_discrete != new_new assert new_discrete != new_other - assert new_new != new_other \ No newline at end of file + assert new_new != new_other diff --git a/tests/serialization/serialize_models_test.py b/tests/serialization/serialize_models_test.py index a70e2cb3..eb271e83 100644 --- 
a/tests/serialization/serialize_models_test.py +++ b/tests/serialization/serialize_models_test.py @@ -1,34 +1,40 @@ -import pytest +import pickle + import pyarrow as pa import pybnesian as pbn -from pybnesian import BayesianNetworkType, BayesianNetwork, ConditionalBayesianNetwork, GaussianNetwork,\ - SemiparametricBN, KDENetwork, DiscreteBN, LinearGaussianCPD, CKDE, DiscreteFactor -import pickle -import util_test +import pytest +from helpers.data import generate_discrete_data, generate_normal_data_independent + @pytest.fixture def gaussian_bytes(): - gaussian = GaussianNetwork(["a", "b", "c", "d"], [("a", "b")]) + gaussian = pbn.GaussianNetwork(["A", "B", "C", "D"], [("A", "B")]) return pickle.dumps(gaussian) + @pytest.fixture def spbn_bytes(): - spbn = SemiparametricBN(["a", "b", "c", "d"], [("a", "b")], [("b", pbn.CKDEType())]) + spbn = pbn.SemiparametricBN( + ["A", "B", "C", "D"], [("A", "B")], [("B", pbn.CKDEType())] + ) return pickle.dumps(spbn) + @pytest.fixture def kde_bytes(): - kde = KDENetwork(["a", "b", "c", "d"], [("a", "b")]) + kde = pbn.KDENetwork(["A", "B", "C", "D"], [("A", "B")]) return pickle.dumps(kde) + @pytest.fixture def discrete_bytes(): - discrete = DiscreteBN(["a", "b", "c", "d"], [("a", "b")]) + discrete = pbn.DiscreteBN(["A", "B", "C", "D"], [("A", "B")]) return pickle.dumps(discrete) -class MyRestrictedGaussianNetworkType(BayesianNetworkType): + +class MyRestrictedGaussianNetworkType(pbn.BayesianNetworkType): def __init__(self): - BayesianNetworkType.__init__(self) + pbn.BayesianNetworkType.__init__(self) def is_homogeneous(self): return True @@ -37,7 +43,7 @@ def default_node_type(self): return pbn.LinearGaussianCPDType() def can_have_arc(self, model, source, target): - return "a" in source + return "A" in source def new_bn(self, nodes): return NewBN(nodes) @@ -48,27 +54,37 @@ def new_cbn(self, nodes, interface_nodes): def __str__(self): return "MyRestrictedGaussianNetworkType" + @pytest.fixture def genericbn_bytes(): - gen = BayesianNetwork(MyRestrictedGaussianNetworkType(), ["a", "b", "c", "d"], [("a", "b")]) + gen = pbn.BayesianNetwork( + MyRestrictedGaussianNetworkType(), ["A", "B", "C", "D"], [("A", "B")] + ) return pickle.dumps(gen) -class NewBN(BayesianNetwork): + +class NewBN(pbn.BayesianNetwork): def __init__(self, variables, arcs=None): if arcs is None: - BayesianNetwork.__init__(self, MyRestrictedGaussianNetworkType(), variables) + pbn.BayesianNetwork.__init__( + self, MyRestrictedGaussianNetworkType(), variables + ) else: - BayesianNetwork.__init__(self, MyRestrictedGaussianNetworkType(), variables, arcs) + pbn.BayesianNetwork.__init__( + self, MyRestrictedGaussianNetworkType(), variables, arcs + ) + @pytest.fixture def newbn_bytes(): - new = NewBN(["a", "b", "c", "d"], [("a", "b")]) + new = NewBN(["A", "B", "C", "D"], [("A", "B")]) return pickle.dumps(new) -class NonHomogeneousType(BayesianNetworkType): + +class NonHomogeneousType(pbn.BayesianNetworkType): def __init__(self): - BayesianNetworkType.__init__(self) - + pbn.BayesianNetworkType.__init__(self) + def is_homogeneous(self): return False @@ -88,18 +104,24 @@ def __str__(self): return "NonHomogeneousType" -class OtherBN(BayesianNetwork): +class OtherBN(pbn.BayesianNetwork): def __init__(self, variables, arcs=None, node_types=None): if arcs is None: if node_types is None: - BayesianNetwork.__init__(self, NonHomogeneousType(), variables) + pbn.BayesianNetwork.__init__(self, NonHomogeneousType(), variables) else: - BayesianNetwork.__init__(self, NonHomogeneousType(), variables, node_types) + 
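# ---------------------------------------------------------------------------
# Sketch of the surface a custom BayesianNetworkType implements in this file
# (TinyGaussianType is a hypothetical name; the new_bn()/new_cbn() factories
# are omitted here but are needed when the type must build fresh networks,
# as MyRestrictedGaussianNetworkType does above):
import pybnesian as pbn

class TinyGaussianType(pbn.BayesianNetworkType):
    def __init__(self):
        pbn.BayesianNetworkType.__init__(self)

    def is_homogeneous(self):
        return True  # every node shares one factor type

    def default_node_type(self):
        return pbn.LinearGaussianCPDType()  # used because homogeneous

    def can_have_arc(self, model, source, target):
        return "A" in source  # same whitelist rule as above

    def __str__(self):
        return "TinyGaussianType"
# ---------------------------------------------------------------------------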
pbn.BayesianNetwork.__init__(
+ self, NonHomogeneousType(), variables, node_types
+ )
else:
if node_types is None:
- BayesianNetwork.__init__(self, NonHomogeneousType(), variables, arcs)
+ pbn.BayesianNetwork.__init__(
+ self, NonHomogeneousType(), variables, arcs
+ )
else:
- BayesianNetwork.__init__(self, NonHomogeneousType(), variables, arcs, node_types)
+ pbn.BayesianNetwork.__init__(
+ self, NonHomogeneousType(), variables, arcs, node_types
+ )
self.extra_info = "extra"
@@ -109,120 +131,161 @@ def __getstate_extra__(self):
def __setstate_extra__(self, t):
self.extra_info = t
+
@pytest.fixture
def otherbn_bytes():
- other = OtherBN(["a", "b", "c", "d"], [("a", "b")], [("b", pbn.LinearGaussianCPDType()),
- ("c", pbn.CKDEType()),
- ("d", pbn.DiscreteFactorType())])
+ other = OtherBN(
+ ["A", "B", "C", "D"],
+ [("A", "B")],
+ [
+ ("B", pbn.LinearGaussianCPDType()),
+ ("C", pbn.CKDEType()),
+ ("D", pbn.DiscreteFactorType()),
+ ],
+ )
return pickle.dumps(other)

-def test_serialization_bn_model(gaussian_bytes, spbn_bytes, kde_bytes, discrete_bytes, genericbn_bytes, newbn_bytes, otherbn_bytes):
+
+def test_serialization_bn_model(
+ gaussian_bytes,
+ spbn_bytes,
+ kde_bytes,
+ discrete_bytes,
+ genericbn_bytes,
+ newbn_bytes,
+ otherbn_bytes,
+):
loaded_g = pickle.loads(gaussian_bytes)
- assert set(loaded_g.nodes()) == set(["a", "b", "c", "d"])
- assert loaded_g.arcs() == [("a", "b")]
+ assert set(loaded_g.nodes()) == set(["A", "B", "C", "D"])
+ assert loaded_g.arcs() == [("A", "B")]
assert loaded_g.type() == pbn.GaussianNetworkType()
loaded_s = pickle.loads(spbn_bytes)
- assert set(loaded_s.nodes()) == set(["a", "b", "c", "d"])
- assert loaded_s.arcs() == [("a", "b")]
+ assert set(loaded_s.nodes()) == set(["A", "B", "C", "D"])
+ assert loaded_s.arcs() == [("A", "B")]
assert loaded_s.type() == pbn.SemiparametricBNType()
- assert loaded_s.node_types() == {'a': pbn.UnknownFactorType(),
- 'b': pbn.CKDEType(),
- 'c': pbn.UnknownFactorType(),
- 'd': pbn.UnknownFactorType()}
+ assert loaded_s.node_types() == {
+ "A": pbn.UnknownFactorType(),
+ "B": pbn.CKDEType(),
+ "C": pbn.UnknownFactorType(),
+ "D": pbn.UnknownFactorType(),
+ }
loaded_k = pickle.loads(kde_bytes)
- assert set(loaded_k.nodes()) == set(["a", "b", "c", "d"])
- assert loaded_k.arcs() == [("a", "b")]
+ assert set(loaded_k.nodes()) == set(["A", "B", "C", "D"])
+ assert loaded_k.arcs() == [("A", "B")]
assert loaded_k.type() == pbn.KDENetworkType()
loaded_d = pickle.loads(discrete_bytes)
- assert set(loaded_d.nodes()) == set(["a", "b", "c", "d"])
- assert loaded_d.arcs() == [("a", "b")]
+ assert set(loaded_d.nodes()) == set(["A", "B", "C", "D"])
+ assert loaded_d.arcs() == [("A", "B")]
assert loaded_d.type() == pbn.DiscreteBNType()
loaded_gen = pickle.loads(genericbn_bytes)
- assert set(loaded_gen.nodes()) == set(["a", "b", "c", "d"])
- assert loaded_gen.arcs() == [("a", "b")]
+ assert set(loaded_gen.nodes()) == set(["A", "B", "C", "D"])
+ assert loaded_gen.arcs() == [("A", "B")]
assert loaded_gen.type() == MyRestrictedGaussianNetworkType()
loaded_nn = pickle.loads(newbn_bytes)
- assert set(loaded_g.nodes()) == set(["a", "b", "c", "d"])
- assert loaded_nn.arcs() == [("a", "b")]
+ assert set(loaded_nn.nodes()) == set(["A", "B", "C", "D"])
+ assert loaded_nn.arcs() == [("A", "B")]
assert loaded_nn.type() == MyRestrictedGaussianNetworkType()
loaded_o = pickle.loads(otherbn_bytes)
- assert set(loaded_g.nodes()) == set(["a", "b", "c", "d"])
- assert loaded_o.arcs() == [("a", "b")]
+ assert set(loaded_o.nodes()) == set(["A", "B", "C", "D"])
+
assert loaded_o.arcs() == [("A", "B")] assert loaded_o.type() == NonHomogeneousType() - assert loaded_o.node_types() == {'a': pbn.UnknownFactorType(), - 'b': pbn.LinearGaussianCPDType(), - 'c': pbn.CKDEType(), - 'd': pbn.DiscreteFactorType()} + assert loaded_o.node_types() == { + "A": pbn.UnknownFactorType(), + "B": pbn.LinearGaussianCPDType(), + "C": pbn.CKDEType(), + "D": pbn.DiscreteFactorType(), + } assert loaded_o.extra_info == "extra" assert loaded_nn.type() != loaded_o.type() + @pytest.fixture def gaussian_partial_fit_bytes(): - gaussian = GaussianNetwork(["a", "b", "c", "d"], [("a", "b")]) - lg = pbn.LinearGaussianCPD("b", ["a"], [1, 2], 2) + gaussian = pbn.GaussianNetwork(["A", "B", "C", "D"], [("A", "B")]) + lg = pbn.LinearGaussianCPD("B", ["A"], [1, 2], 2) gaussian.add_cpds([lg]) gaussian.include_cpd = True return pickle.dumps(gaussian) + @pytest.fixture def gaussian_fit_bytes(): - gaussian = GaussianNetwork(["a", "b", "c", "d"], [("a", "b")]) - lg_a = LinearGaussianCPD("a", [], [0], 0.5) - lg_b = LinearGaussianCPD("b", ["a"], [1, 2], 2) - lg_c = LinearGaussianCPD("c", [], [2], 1) - lg_d = LinearGaussianCPD("d", [], [3], 1.5) + gaussian = pbn.GaussianNetwork(["A", "B", "C", "D"], [("A", "B")]) + lg_a = pbn.LinearGaussianCPD("A", [], [0], 0.5) + lg_b = pbn.LinearGaussianCPD("B", ["A"], [1, 2], 2) + lg_c = pbn.LinearGaussianCPD("C", [], [2], 1) + lg_d = pbn.LinearGaussianCPD("D", [], [3], 1.5) gaussian.add_cpds([lg_a, lg_b, lg_c, lg_d]) gaussian.include_cpd = True return pickle.dumps(gaussian) + @pytest.fixture def other_partial_fit_bytes(): - other = OtherBN(["a", "b", "c", "d"], [("a", "b")], [("b", pbn.LinearGaussianCPDType()), - ("c", pbn.CKDEType()), - ("d", pbn.DiscreteFactorType())]) - lg = LinearGaussianCPD("b", ["a"], [1, 2], 2) + other = OtherBN( + ["A", "B", "C", "D"], + [("A", "B")], + [ + ("B", pbn.LinearGaussianCPDType()), + ("C", pbn.CKDEType()), + ("D", pbn.DiscreteFactorType()), + ], + ) + lg = pbn.LinearGaussianCPD("B", ["A"], [1, 2], 2) other.add_cpds([lg]) other.include_cpd = True return pickle.dumps(other) + @pytest.fixture def other_fit_bytes(): - other = OtherBN(["a", "b", "c", "d"], [("a", "b")], [("b", pbn.LinearGaussianCPDType()), - ("c", pbn.CKDEType()), - ("d", pbn.DiscreteFactorType())]) - cpd_a = LinearGaussianCPD("a", [], [0], 0.5) - cpd_b = LinearGaussianCPD("b", ["a"], [1, 2], 2) - - df_continuous = util_test.generate_normal_data_indep(100) - cpd_c = CKDE("c", []) + other = OtherBN( + ["A", "B", "C", "D"], + [("A", "B")], + [ + ("B", pbn.LinearGaussianCPDType()), + ("C", pbn.CKDEType()), + ("D", pbn.DiscreteFactorType()), + ], + ) + cpd_a = pbn.LinearGaussianCPD("A", [], [0], 0.5) + cpd_b = pbn.LinearGaussianCPD("B", ["A"], [1, 2], 2) + + df_continuous = generate_normal_data_independent(100) + cpd_c = pbn.CKDE("C", []) cpd_c.fit(df_continuous) - df_discrete = util_test.generate_discrete_data_dependent(100) - df_discrete.columns = df_discrete.columns.str.lower() - cpd_d = DiscreteFactor("d", []) + df_discrete = generate_discrete_data(100) + cpd_d = pbn.DiscreteFactor("D", []) cpd_d.fit(df_discrete) - + other.add_cpds([cpd_a, cpd_b, cpd_c, cpd_d]) other.include_cpd = True return pickle.dumps(other) -def test_serialization_fitted_bn(gaussian_partial_fit_bytes, gaussian_fit_bytes, other_partial_fit_bytes, other_fit_bytes): + +def test_serialization_fitted_bn( + gaussian_partial_fit_bytes, + gaussian_fit_bytes, + other_partial_fit_bytes, + other_fit_bytes, +): # #################### # Gaussian partial fit # #################### loaded_partial = 
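# ---------------------------------------------------------------------------
# Sketch of the include_cpd switch these fixtures set before pickling
# (assumes only the API used above): without it, a pickled network keeps its
# structure only; with it, any CPDs already added travel with the network,
# even when the network as a whole is not fully fitted.
import pickle
import pybnesian as pbn

g = pbn.GaussianNetwork(["A", "B"], [("A", "B")])
g.add_cpds([pbn.LinearGaussianCPD("B", ["A"], [1, 2], 2)])
g.include_cpd = True
restored = pickle.loads(pickle.dumps(g))
assert not restored.fitted()  # "A" still lacks a CPD
assert list(restored.cpd("B").beta) == [1, 2]
# ---------------------------------------------------------------------------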
pickle.loads(gaussian_partial_fit_bytes)
assert not loaded_partial.fitted()
- cpd = loaded_partial.cpd("b")
- assert cpd.variable() == "b"
- assert cpd.evidence() == ["a"]
+ cpd = loaded_partial.cpd("B")
+ assert cpd.variable() == "B"
+ assert cpd.evidence() == ["A"]
assert list(cpd.beta) == [1, 2]
assert cpd.variance == 2
@@ -232,26 +295,26 @@ def test_serialization_fitted_bn(gaussian_partial_fit_bytes, gaussian_fit_bytes,
loaded_fitted = pickle.loads(gaussian_fit_bytes)
assert loaded_fitted.fitted()
- cpd_a = loaded_fitted.cpd("a")
- assert cpd_a.variable() == "a"
+ cpd_a = loaded_fitted.cpd("A")
+ assert cpd_a.variable() == "A"
assert cpd_a.evidence() == []
assert cpd_a.beta == [0]
assert cpd_a.variance == 0.5
- cpd_b = loaded_fitted.cpd("b")
- assert cpd_b.variable() == "b"
- assert cpd_b.evidence() == ["a"]
+ cpd_b = loaded_fitted.cpd("B")
+ assert cpd_b.variable() == "B"
+ assert cpd_b.evidence() == ["A"]
assert list(cpd_b.beta) == [1, 2]
assert cpd_b.variance == 2
- cpd_c = loaded_fitted.cpd("c")
- assert cpd_c.variable() == "c"
+ cpd_c = loaded_fitted.cpd("C")
+ assert cpd_c.variable() == "C"
assert cpd_c.evidence() == []
assert cpd_c.beta == [2]
assert cpd_c.variance == 1
-
- cpd_d = loaded_fitted.cpd("d")
- assert cpd_d.variable() == "d"
+
+ cpd_d = loaded_fitted.cpd("D")
+ assert cpd_d.variable() == "D"
assert cpd_d.evidence() == []
assert cpd_d.beta == [3]
assert cpd_d.variance == 1.5
@@ -261,9 +324,9 @@ def test_serialization_fitted_bn(gaussian_partial_fit_bytes, gaussian_fit_bytes,
# ####################
loaded_other = pickle.loads(other_partial_fit_bytes)
assert not loaded_other.fitted()
- cpd = loaded_partial.cpd("b")
- assert cpd.variable() == "b"
- assert cpd.evidence() == ["a"]
+ cpd = loaded_other.cpd("B")
+ assert cpd.variable() == "B"
+ assert cpd.evidence() == ["A"]
assert list(cpd.beta) == [1, 2]
assert cpd.variance == 2
@@ -273,29 +336,29 @@ def test_serialization_fitted_bn(gaussian_partial_fit_bytes, gaussian_fit_bytes,
loaded_other_fitted = pickle.loads(other_fit_bytes)
assert loaded_other_fitted.fitted()
- cpd_a = loaded_other_fitted.cpd("a")
- assert cpd_a.variable() == "a"
+ cpd_a = loaded_other_fitted.cpd("A")
+ assert cpd_a.variable() == "A"
assert cpd_a.evidence() == []
assert cpd_a.beta == [0]
assert cpd_a.variance == 0.5
assert cpd_a.type() == pbn.LinearGaussianCPDType()
- cpd_b = loaded_other_fitted.cpd("b")
- assert cpd_b.variable() == "b"
- assert cpd_b.evidence() == ["a"]
+ cpd_b = loaded_other_fitted.cpd("B")
+ assert cpd_b.variable() == "B"
+ assert cpd_b.evidence() == ["A"]
assert list(cpd_b.beta) == [1, 2]
assert cpd_b.variance == 2
assert cpd_b.type() == pbn.LinearGaussianCPDType()
- cpd_c = loaded_other_fitted.cpd("c")
- assert cpd_c.variable() == "c"
+ cpd_c = loaded_other_fitted.cpd("C")
+ assert cpd_c.variable() == "C"
assert cpd_c.evidence() == []
assert cpd_c.fitted()
assert cpd_c.num_instances() == 100
assert cpd_c.type() == pbn.CKDEType()
- cpd_d = loaded_other_fitted.cpd("d")
- assert cpd_d.variable() == "d"
+ cpd_d = loaded_other_fitted.cpd("D")
+ assert cpd_d.variable() == "D"
assert cpd_d.evidence() == []
assert cpd_d.fitted()
assert cpd_d.type() == pbn.DiscreteFactorType()
@@ -305,55 +368,79 @@ def test_serialization_fitted_bn(gaussian_partial_fit_bytes, gaussian_fit_bytes,
# Conditional BN
# ##########################
+
@pytest.fixture
def cond_gaussian_bytes():
- gaussian = pbn.ConditionalGaussianNetwork(["c", "d"], ["a", "b"], [("a", "c")])
+ gaussian = pbn.ConditionalGaussianNetwork(["C", "D"], ["A", "B"], [("A", "C")])
return
pickle.dumps(gaussian) + @pytest.fixture def cond_spbn_bytes(): - spbn = pbn.ConditionalSemiparametricBN(["c", "d"], ["a", "b"], [("a", "c")], [("c", pbn.CKDEType())]) + spbn = pbn.ConditionalSemiparametricBN( + ["C", "D"], ["A", "B"], [("A", "C")], [("C", pbn.CKDEType())] + ) return pickle.dumps(spbn) + @pytest.fixture def cond_kde_bytes(): - kde = pbn.ConditionalKDENetwork(["c", "d"], ["a", "b"], [("a", "c")]) + kde = pbn.ConditionalKDENetwork(["C", "D"], ["A", "B"], [("A", "C")]) return pickle.dumps(kde) + @pytest.fixture def cond_discrete_bytes(): - discrete = pbn.ConditionalDiscreteBN(["c", "d"], ["a", "b"], [("a", "c")]) + discrete = pbn.ConditionalDiscreteBN(["C", "D"], ["A", "B"], [("A", "C")]) return pickle.dumps(discrete) + @pytest.fixture def cond_genericbn_bytes(): - gen = ConditionalBayesianNetwork(MyRestrictedGaussianNetworkType(), ["c", "d"], ["a", "b"], [("a", "c")]) + gen = pbn.ConditionalBayesianNetwork( + MyRestrictedGaussianNetworkType(), ["C", "D"], ["A", "B"], [("A", "C")] + ) return pickle.dumps(gen) -class ConditionalNewBN(ConditionalBayesianNetwork): + +class ConditionalNewBN(pbn.ConditionalBayesianNetwork): def __init__(self, variables, interface, arcs=None): if arcs is None: - ConditionalBayesianNetwork.__init__(self, MyRestrictedGaussianNetworkType(), variables, interface) + pbn.ConditionalBayesianNetwork.__init__( + self, MyRestrictedGaussianNetworkType(), variables, interface + ) else: - ConditionalBayesianNetwork.__init__(self, MyRestrictedGaussianNetworkType(), variables, interface, arcs) + pbn.ConditionalBayesianNetwork.__init__( + self, MyRestrictedGaussianNetworkType(), variables, interface, arcs + ) + @pytest.fixture def cond_newbn_bytes(): - new = ConditionalNewBN(["c", "d"], ["a", "b"], [("a", "c")]) + new = ConditionalNewBN(["C", "D"], ["A", "B"], [("A", "C")]) return pickle.dumps(new) -class ConditionalOtherBN(ConditionalBayesianNetwork): + +class ConditionalOtherBN(pbn.ConditionalBayesianNetwork): def __init__(self, variables, interface, arcs=None, node_types=None): if arcs is None: if node_types is None: - ConditionalBayesianNetwork.__init__(self, NonHomogeneousType(), variables, interface) + pbn.ConditionalBayesianNetwork.__init__( + self, NonHomogeneousType(), variables, interface + ) else: - ConditionalBayesianNetwork.__init__(self, NonHomogeneousType(), variables, interface, node_types) + pbn.ConditionalBayesianNetwork.__init__( + self, NonHomogeneousType(), variables, interface, node_types + ) else: if node_types is None: - ConditionalBayesianNetwork.__init__(self, NonHomogeneousType(), variables, interface, arcs) + pbn.ConditionalBayesianNetwork.__init__( + self, NonHomogeneousType(), variables, interface, arcs + ) else: - ConditionalBayesianNetwork.__init__(self, NonHomogeneousType(), variables, interface, arcs, node_types) + pbn.ConditionalBayesianNetwork.__init__( + self, NonHomogeneousType(), variables, interface, arcs, node_types + ) self.extra_info = "extra" @@ -363,64 +450,76 @@ def __getstate_extra__(self): def __setstate_extra__(self, t): self.extra_info = t + @pytest.fixture def cond_otherbn_bytes(): - other = ConditionalOtherBN(["c", "d"], ["a", "b"], [("a", "c")], [("b", pbn.LinearGaussianCPDType()), - ("c", pbn.CKDEType()), - ("d", pbn.DiscreteFactorType())]) + other = ConditionalOtherBN( + ["C", "D"], + ["A", "B"], + [("A", "C")], + [ + ("B", pbn.LinearGaussianCPDType()), + ("C", pbn.CKDEType()), + ("D", pbn.DiscreteFactorType()), + ], + ) return pickle.dumps(other) - -def 
test_serialization_conditional_bn_model(cond_gaussian_bytes, cond_spbn_bytes, cond_kde_bytes, - cond_discrete_bytes, cond_genericbn_bytes, - cond_newbn_bytes, cond_otherbn_bytes, - newbn_bytes, otherbn_bytes): +def test_serialization_conditional_bn_model( + cond_gaussian_bytes, + cond_spbn_bytes, + cond_kde_bytes, + cond_discrete_bytes, + cond_genericbn_bytes, + cond_newbn_bytes, + cond_otherbn_bytes, + newbn_bytes, + otherbn_bytes, +): loaded_g = pickle.loads(cond_gaussian_bytes) - assert set(loaded_g.nodes()) == set(["c", "d"]) - assert set(loaded_g.interface_nodes()) == set(["a", "b"]) - assert loaded_g.arcs() == [("a", "c")] + assert set(loaded_g.nodes()) == set(["C", "D"]) + assert set(loaded_g.interface_nodes()) == set(["A", "B"]) + assert loaded_g.arcs() == [("A", "C")] assert loaded_g.type() == pbn.GaussianNetworkType() loaded_s = pickle.loads(cond_spbn_bytes) - assert set(loaded_s.nodes()) == set(["c", "d"]) - assert set(loaded_s.interface_nodes()) == set(["a", "b"]) - assert loaded_s.arcs() == [("a", "c")] + assert set(loaded_s.nodes()) == set(["C", "D"]) + assert set(loaded_s.interface_nodes()) == set(["A", "B"]) + assert loaded_s.arcs() == [("A", "C")] assert loaded_s.type() == pbn.SemiparametricBNType() - assert loaded_s.node_types() == {'c': pbn.CKDEType(), - 'd': pbn.UnknownFactorType()} + assert loaded_s.node_types() == {"C": pbn.CKDEType(), "D": pbn.UnknownFactorType()} loaded_k = pickle.loads(cond_kde_bytes) - assert set(loaded_k.nodes()) == set(["c", "d"]) - assert set(loaded_k.interface_nodes()) == set(["a", "b"]) - assert loaded_k.arcs() == [("a", "c")] + assert set(loaded_k.nodes()) == set(["C", "D"]) + assert set(loaded_k.interface_nodes()) == set(["A", "B"]) + assert loaded_k.arcs() == [("A", "C")] assert loaded_k.type() == pbn.KDENetworkType() loaded_d = pickle.loads(cond_discrete_bytes) - assert set(loaded_d.nodes()) == set(["c", "d"]) - assert set(loaded_d.interface_nodes()) == set(["a", "b"]) - assert loaded_d.arcs() == [("a", "c")] + assert set(loaded_d.nodes()) == set(["C", "D"]) + assert set(loaded_d.interface_nodes()) == set(["A", "B"]) + assert loaded_d.arcs() == [("A", "C")] assert loaded_d.type() == pbn.DiscreteBNType() loaded_gen = pickle.loads(cond_genericbn_bytes) - assert set(loaded_gen.nodes()) == set(["c", "d"]) - assert set(loaded_gen.interface_nodes()) == set(["a", "b"]) - assert loaded_gen.arcs() == [("a", "c")] + assert set(loaded_gen.nodes()) == set(["C", "D"]) + assert set(loaded_gen.interface_nodes()) == set(["A", "B"]) + assert loaded_gen.arcs() == [("A", "C")] assert loaded_gen.type() == MyRestrictedGaussianNetworkType() loaded_nn = pickle.loads(cond_newbn_bytes) - assert set(loaded_nn.nodes()) == set(["c", "d"]) - assert set(loaded_nn.interface_nodes()) == set(["a", "b"]) - assert loaded_nn.arcs() == [("a", "c")] + assert set(loaded_nn.nodes()) == set(["C", "D"]) + assert set(loaded_nn.interface_nodes()) == set(["A", "B"]) + assert loaded_nn.arcs() == [("A", "C")] assert loaded_nn.type() == MyRestrictedGaussianNetworkType() loaded_o = pickle.loads(cond_otherbn_bytes) - assert set(loaded_o.nodes()) == set(["c", "d"]) - assert set(loaded_o.interface_nodes()) == set(["a", "b"]) - assert loaded_o.arcs() == [("a", "c")] + assert set(loaded_o.nodes()) == set(["C", "D"]) + assert set(loaded_o.interface_nodes()) == set(["A", "B"]) + assert loaded_o.arcs() == [("A", "C")] assert loaded_o.type() == NonHomogeneousType() - assert loaded_o.node_types() == {'c': pbn.CKDEType(), - 'd': pbn.DiscreteFactorType()} + assert loaded_o.node_types() == {"C": 
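# ---------------------------------------------------------------------------
# Sketch of the node/interface split walked through here (assumes only the
# API used above): a conditional network models ["C", "D"] given the
# interface ["A", "B"]; interface nodes may be arc sources but receive no CPD.
import pybnesian as pbn

cg = pbn.ConditionalGaussianNetwork(["C", "D"], ["A", "B"], [("A", "C")])
assert set(cg.nodes()) == {"C", "D"}
assert set(cg.interface_nodes()) == {"A", "B"}
assert cg.arcs() == [("A", "C")]
# ---------------------------------------------------------------------------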
pbn.CKDEType(), "D": pbn.DiscreteFactorType()} assert loaded_o.extra_info == "extra" assert loaded_nn.type() != loaded_o.type() @@ -431,62 +530,78 @@ def test_serialization_conditional_bn_model(cond_gaussian_bytes, cond_spbn_bytes assert loaded_nn.type() == loaded_unconditional_nn.type() assert loaded_o.type() == loaded_unconditional_o.type() + @pytest.fixture def cond_gaussian_partial_fit_bytes(): - gaussian = pbn.ConditionalGaussianNetwork(["c", "d"], ["a", "b"], [("a", "c")]) - lg = LinearGaussianCPD("c", ["a"], [1, 2], 2) + gaussian = pbn.ConditionalGaussianNetwork(["C", "D"], ["A", "B"], [("A", "C")]) + lg = pbn.LinearGaussianCPD("C", ["A"], [1, 2], 2) gaussian.add_cpds([lg]) gaussian.include_cpd = True return pickle.dumps(gaussian) + @pytest.fixture def cond_gaussian_fit_bytes(): - gaussian = pbn.ConditionalGaussianNetwork(["c", "d"], ["a", "b"], [("a", "c")]) - lg_c = LinearGaussianCPD("c", ["a"], [1, 2], 2) - lg_d = LinearGaussianCPD("d", [], [3], 1.5) + gaussian = pbn.ConditionalGaussianNetwork(["C", "D"], ["A", "B"], [("A", "C")]) + lg_c = pbn.LinearGaussianCPD("C", ["A"], [1, 2], 2) + lg_d = pbn.LinearGaussianCPD("D", [], [3], 1.5) gaussian.add_cpds([lg_c, lg_d]) gaussian.include_cpd = True return pickle.dumps(gaussian) + @pytest.fixture def cond_other_partial_fit_bytes(): - other = ConditionalOtherBN(["c", "d"], ["a", "b"], [("a", "c")], [("c", pbn.CKDEType()), - ("d", pbn.LinearGaussianCPDType())]) - lg = LinearGaussianCPD("d", [], [3], 1.5) + other = ConditionalOtherBN( + ["C", "D"], + ["A", "B"], + [("A", "C")], + [("C", pbn.CKDEType()), ("D", pbn.LinearGaussianCPDType())], + ) + lg = pbn.LinearGaussianCPD("D", [], [3], 1.5) other.add_cpds([lg]) other.include_cpd = True return pickle.dumps(other) + @pytest.fixture def cond_other_fit_bytes(): - other = ConditionalOtherBN(["c", "d"], ["a", "b"], [("a", "c")], [("c", pbn.CKDEType()), - ("d", pbn.DiscreteFactorType())]) - cpd_c = CKDE("c", ["a"]) - cpd_d = DiscreteFactor("d", []) - - df_continuous = util_test.generate_normal_data_indep(100) + other = ConditionalOtherBN( + ["C", "D"], + ["A", "B"], + [("A", "C")], + [("C", pbn.CKDEType()), ("D", pbn.DiscreteFactorType())], + ) + cpd_c = pbn.CKDE("C", ["A"]) + cpd_d = pbn.DiscreteFactor("D", []) + + df_continuous = generate_normal_data_independent(100) cpd_c.fit(df_continuous) - df_discrete = util_test.generate_discrete_data_dependent(100) - df_discrete.columns = df_discrete.columns.str.lower() - cpd_d = DiscreteFactor("d", []) + df_discrete = generate_discrete_data(100) + cpd_d = pbn.DiscreteFactor("D", []) cpd_d.fit(df_discrete) other.add_cpds([cpd_c, cpd_d]) - + other.include_cpd = True return pickle.dumps(other) -def test_serialization_fitted_conditional_bn(cond_gaussian_partial_fit_bytes, cond_gaussian_fit_bytes, - cond_other_partial_fit_bytes, cond_other_fit_bytes): + +def test_serialization_fitted_conditional_bn( + cond_gaussian_partial_fit_bytes, + cond_gaussian_fit_bytes, + cond_other_partial_fit_bytes, + cond_other_fit_bytes, +): # #################### # Gaussian partial fit # #################### loaded_partial = pickle.loads(cond_gaussian_partial_fit_bytes) assert not loaded_partial.fitted() - cpd = loaded_partial.cpd("c") - assert cpd.variable() == "c" - assert cpd.evidence() == ["a"] + cpd = loaded_partial.cpd("C") + assert cpd.variable() == "C" + assert cpd.evidence() == ["A"] assert list(cpd.beta) == [1, 2] assert cpd.variance == 2 @@ -496,14 +611,14 @@ def test_serialization_fitted_conditional_bn(cond_gaussian_partial_fit_bytes, co loaded_fitted = 
pickle.loads(cond_gaussian_fit_bytes) assert loaded_fitted.fitted() - cpd_c = loaded_fitted.cpd("c") - assert cpd_c.variable() == "c" - assert cpd_c.evidence() == ["a"] + cpd_c = loaded_fitted.cpd("C") + assert cpd_c.variable() == "C" + assert cpd_c.evidence() == ["A"] assert list(cpd_c.beta) == [1, 2] assert cpd_c.variance == 2 - - cpd_d = loaded_fitted.cpd("d") - assert cpd_d.variable() == "d" + + cpd_d = loaded_fitted.cpd("D") + assert cpd_d.variable() == "D" assert cpd_d.evidence() == [] assert cpd_d.beta == [3] assert cpd_d.variance == 1.5 @@ -513,8 +628,8 @@ def test_serialization_fitted_conditional_bn(cond_gaussian_partial_fit_bytes, co # #################### loaded_other = pickle.loads(cond_other_partial_fit_bytes) assert not loaded_other.fitted() - cpd = loaded_other.cpd("d") - assert cpd.variable() == "d" + cpd = loaded_other.cpd("D") + assert cpd.variable() == "D" assert cpd.evidence() == [] assert cpd.beta == [3] assert cpd.variance == 1.5 @@ -525,15 +640,15 @@ def test_serialization_fitted_conditional_bn(cond_gaussian_partial_fit_bytes, co loaded_other_fitted = pickle.loads(cond_other_fit_bytes) assert loaded_other_fitted.fitted() - cpd_c = loaded_other_fitted.cpd("c") - assert cpd_c.variable() == "c" - assert cpd_c.evidence() == ["a"] + cpd_c = loaded_other_fitted.cpd("C") + assert cpd_c.variable() == "C" + assert cpd_c.evidence() == ["A"] assert cpd_c.fitted() assert cpd_c.num_instances() == 100 assert cpd_c.type() == pbn.CKDEType() - cpd_d = loaded_other_fitted.cpd("d") - assert cpd_d.variable() == "d" + cpd_d = loaded_other_fitted.cpd("D") + assert cpd_d.variable() == "D" assert cpd_d.evidence() == [] assert cpd_d.fitted() assert cpd_d.type() == pbn.DiscreteFactorType() @@ -541,56 +656,72 @@ def test_serialization_fitted_conditional_bn(cond_gaussian_partial_fit_bytes, co assert loaded_other_fitted.extra_info == "extra" assert loaded_other.type() == loaded_other_fitted.type() + # ########################## # Dynamic BN # ########################## + @pytest.fixture def dyn_gaussian_bytes(): - gaussian = pbn.DynamicGaussianNetwork(["a", "b", "c", "d"], 2) - gaussian.static_bn().add_arc("a_t_2", "d_t_1") - gaussian.transition_bn().add_arc("c_t_2", "b_t_0") + gaussian = pbn.DynamicGaussianNetwork(["A", "B", "C", "D"], 2) + gaussian.static_bn().add_arc("A_t_2", "D_t_1") + gaussian.transition_bn().add_arc("C_t_2", "B_t_0") return pickle.dumps(gaussian) + @pytest.fixture def dyn_spbn_bytes(): - spbn = pbn.DynamicSemiparametricBN(["a", "b", "c", "d"], 2) - spbn.static_bn().add_arc("a_t_2", "d_t_1") - spbn.transition_bn().add_arc("c_t_2", "b_t_0") - spbn.transition_bn().set_node_type("b_t_0", pbn.CKDEType()) + spbn = pbn.DynamicSemiparametricBN(["A", "B", "C", "D"], 2) + spbn.static_bn().add_arc("A_t_2", "D_t_1") + spbn.transition_bn().add_arc("C_t_2", "B_t_0") + spbn.transition_bn().set_node_type("B_t_0", pbn.CKDEType()) return pickle.dumps(spbn) + @pytest.fixture def dyn_kde_bytes(): - kde = pbn.DynamicKDENetwork(["a", "b", "c", "d"], 2) - kde.static_bn().add_arc("a_t_2", "d_t_1") - kde.transition_bn().add_arc("c_t_2", "b_t_0") + kde = pbn.DynamicKDENetwork(["A", "B", "C", "D"], 2) + kde.static_bn().add_arc("A_t_2", "D_t_1") + kde.transition_bn().add_arc("C_t_2", "B_t_0") return pickle.dumps(kde) + @pytest.fixture def dyn_discrete_bytes(): - discrete = pbn.DynamicDiscreteBN(["a", "b", "c", "d"], 2) - discrete.static_bn().add_arc("a_t_2", "d_t_1") - discrete.transition_bn().add_arc("c_t_2", "b_t_0") + discrete = pbn.DynamicDiscreteBN(["A", "B", "C", "D"], 2) + 
discrete.static_bn().add_arc("A_t_2", "D_t_1") + discrete.transition_bn().add_arc("C_t_2", "B_t_0") return pickle.dumps(discrete) + @pytest.fixture def dyn_genericbn_bytes(): - gen = pbn.DynamicBayesianNetwork(MyRestrictedGaussianNetworkType(), ["a", "b", "c", "d"], 2) - gen.static_bn().add_arc("a_t_2", "d_t_1") - gen.transition_bn().add_arc("a_t_2", "b_t_0") + gen = pbn.DynamicBayesianNetwork( + MyRestrictedGaussianNetworkType(), ["A", "B", "C", "D"], 2 + ) + gen.static_bn().add_arc("A_t_2", "D_t_1") + gen.transition_bn().add_arc("A_t_2", "B_t_0") return pickle.dumps(gen) + class DynamicNewBN(pbn.DynamicBayesianNetwork): def __init__(self, variables, markovian_order): - pbn.DynamicBayesianNetwork.__init__(self, MyRestrictedGaussianNetworkType(), variables, markovian_order) + pbn.DynamicBayesianNetwork.__init__( + self, MyRestrictedGaussianNetworkType(), variables, markovian_order + ) + class DynamicOtherBN(pbn.DynamicBayesianNetwork): def __init__(self, variables, markovian_order, static_bn=None, transition_bn=None): if static_bn is None or transition_bn is None: - pbn.DynamicBayesianNetwork.__init__(self, NonHomogeneousType(), variables, markovian_order) + pbn.DynamicBayesianNetwork.__init__( + self, NonHomogeneousType(), variables, markovian_order + ) else: - pbn.DynamicBayesianNetwork.__init__(self, variables, markovian_order, static_bn, transition_bn) + pbn.DynamicBayesianNetwork.__init__( + self, variables, markovian_order, static_bn, transition_bn + ) self.extra_info = "extra" def __getstate_extra__(self): @@ -599,117 +730,140 @@ def __getstate_extra__(self): def __setstate_extra__(self, t): self.extra_info = t + @pytest.fixture def dyn_newbn_bytes(): - new = DynamicNewBN(["a", "b", "c", "d"], 2) - new.static_bn().add_arc("a_t_2", "d_t_1") - new.transition_bn().add_arc("a_t_2", "b_t_0") + new = DynamicNewBN(["A", "B", "C", "D"], 2) + new.static_bn().add_arc("A_t_2", "D_t_1") + new.transition_bn().add_arc("A_t_2", "B_t_0") return pickle.dumps(new) + @pytest.fixture def dyn_otherbn_bytes(): - other = DynamicOtherBN(["a", "b", "c", "d"], 2) - other.static_bn().add_arc("a_t_2", "d_t_1") - other.static_bn().set_node_type("c_t_1", pbn.DiscreteFactorType()) - other.static_bn().set_node_type("d_t_1", pbn.CKDEType()) + other = DynamicOtherBN(["A", "B", "C", "D"], 2) + other.static_bn().add_arc("A_t_2", "D_t_1") + other.static_bn().set_node_type("C_t_1", pbn.DiscreteFactorType()) + other.static_bn().set_node_type("D_t_1", pbn.CKDEType()) - other.transition_bn().add_arc("a_t_2", "b_t_0") - other.transition_bn().set_node_type("d_t_0", pbn.CKDEType()) + other.transition_bn().add_arc("A_t_2", "B_t_0") + other.transition_bn().set_node_type("D_t_0", pbn.CKDEType()) return pickle.dumps(other) -def test_serialization_dbn_model(dyn_gaussian_bytes, dyn_spbn_bytes, dyn_kde_bytes, dyn_discrete_bytes, - dyn_genericbn_bytes, dyn_newbn_bytes, dyn_otherbn_bytes): + +def test_serialization_dbn_model( + dyn_gaussian_bytes, + dyn_spbn_bytes, + dyn_kde_bytes, + dyn_discrete_bytes, + dyn_genericbn_bytes, + dyn_newbn_bytes, + dyn_otherbn_bytes, +): loaded_g = pickle.loads(dyn_gaussian_bytes) - assert set(loaded_g.variables()) == set(["a", "b", "c", "d"]) - assert loaded_g.static_bn().arcs() == [("a_t_2", "d_t_1")] - assert loaded_g.transition_bn().arcs() == [("c_t_2", "b_t_0")] + assert set(loaded_g.variables()) == set(["A", "B", "C", "D"]) + assert loaded_g.static_bn().arcs() == [("A_t_2", "D_t_1")] + assert loaded_g.transition_bn().arcs() == [("C_t_2", "B_t_0")] assert loaded_g.type() == 
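# ---------------------------------------------------------------------------
# Sketch of the temporal naming scheme behind these fixtures (slice semantics
# as I read the fixtures above, so treat the details as an assumption): with
# markovian order 2, each variable V is unrolled into V_t_0 (current slice)
# plus V_t_1 and V_t_2 (past slices); the static network covers the past
# slices and the transition network models the t_0 slice conditioned on them.
import pybnesian as pbn

dyn = pbn.DynamicGaussianNetwork(["A", "B"], 2)
dyn.static_bn().add_arc("A_t_2", "B_t_1")      # arc within the past slices
dyn.transition_bn().add_arc("A_t_2", "B_t_0")  # past slice -> current slice
assert set(dyn.variables()) == {"A", "B"}
assert dyn.markovian_order() == 2
# ---------------------------------------------------------------------------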
pbn.GaussianNetworkType() loaded_s = pickle.loads(dyn_spbn_bytes) - assert set(loaded_s.variables()) == set(["a", "b", "c", "d"]) - assert loaded_s.static_bn().arcs() == [("a_t_2", "d_t_1")] - assert loaded_s.transition_bn().arcs() == [("c_t_2", "b_t_0")] + assert set(loaded_s.variables()) == set(["A", "B", "C", "D"]) + assert loaded_s.static_bn().arcs() == [("A_t_2", "D_t_1")] + assert loaded_s.transition_bn().arcs() == [("C_t_2", "B_t_0")] assert loaded_s.type() == pbn.SemiparametricBNType() node_types = {v + "_t_0": pbn.UnknownFactorType() for v in loaded_s.variables()} - node_types["b_t_0"] = pbn.CKDEType() + node_types["B_t_0"] = pbn.CKDEType() assert loaded_s.transition_bn().node_types() == node_types loaded_k = pickle.loads(dyn_kde_bytes) - assert set(loaded_k.variables()) == set(["a", "b", "c", "d"]) - assert loaded_k.static_bn().arcs() == [("a_t_2", "d_t_1")] - assert loaded_k.transition_bn().arcs() == [("c_t_2", "b_t_0")] + assert set(loaded_k.variables()) == set(["A", "B", "C", "D"]) + assert loaded_k.static_bn().arcs() == [("A_t_2", "D_t_1")] + assert loaded_k.transition_bn().arcs() == [("C_t_2", "B_t_0")] assert loaded_k.type() == pbn.KDENetworkType() loaded_d = pickle.loads(dyn_discrete_bytes) - assert set(loaded_d.variables()) == set(["a", "b", "c", "d"]) - assert loaded_d.static_bn().arcs() == [("a_t_2", "d_t_1")] - assert loaded_d.transition_bn().arcs() == [("c_t_2", "b_t_0")] + assert set(loaded_d.variables()) == set(["A", "B", "C", "D"]) + assert loaded_d.static_bn().arcs() == [("A_t_2", "D_t_1")] + assert loaded_d.transition_bn().arcs() == [("C_t_2", "B_t_0")] assert loaded_d.type() == pbn.DiscreteBNType() loaded_gen = pickle.loads(dyn_genericbn_bytes) - assert set(loaded_gen.variables()) == set(["a", "b", "c", "d"]) - assert loaded_gen.static_bn().arcs() == [("a_t_2", "d_t_1")] - assert loaded_gen.transition_bn().arcs() == [("a_t_2", "b_t_0")] + assert set(loaded_gen.variables()) == set(["A", "B", "C", "D"]) + assert loaded_gen.static_bn().arcs() == [("A_t_2", "D_t_1")] + assert loaded_gen.transition_bn().arcs() == [("A_t_2", "B_t_0")] assert loaded_gen.type() == MyRestrictedGaussianNetworkType() loaded_nn = pickle.loads(dyn_newbn_bytes) - assert set(loaded_nn.variables()) == set(["a", "b", "c", "d"]) - assert loaded_nn.static_bn().arcs() == [("a_t_2", "d_t_1")] - assert loaded_nn.transition_bn().arcs() == [("a_t_2", "b_t_0")] + assert set(loaded_nn.variables()) == set(["A", "B", "C", "D"]) + assert loaded_nn.static_bn().arcs() == [("A_t_2", "D_t_1")] + assert loaded_nn.transition_bn().arcs() == [("A_t_2", "B_t_0")] assert loaded_nn.type() == MyRestrictedGaussianNetworkType() loaded_other = pickle.loads(dyn_otherbn_bytes) - assert set(loaded_other.variables()) == set(["a", "b", "c", "d"]) - assert loaded_other.static_bn().arcs() == [("a_t_2", "d_t_1")] - assert loaded_other.transition_bn().arcs() == [("a_t_2", "b_t_0")] + assert set(loaded_other.variables()) == set(["A", "B", "C", "D"]) + assert loaded_other.static_bn().arcs() == [("A_t_2", "D_t_1")] + assert loaded_other.transition_bn().arcs() == [("A_t_2", "B_t_0")] assert loaded_other.type() == NonHomogeneousType() assert loaded_other.extra_info == "extra" - assert loaded_other.static_bn().node_type("c_t_1") == pbn.DiscreteFactorType() - assert loaded_other.static_bn().node_type("d_t_1") == pbn.CKDEType() - assert loaded_other.transition_bn().node_type("d_t_0") == pbn.CKDEType() + assert loaded_other.static_bn().node_type("C_t_1") == pbn.DiscreteFactorType() + assert loaded_other.static_bn().node_type("D_t_1") == 
pbn.CKDEType() + assert loaded_other.transition_bn().node_type("D_t_0") == pbn.CKDEType() + @pytest.fixture def dyn_gaussian_partial_fit_bytes(): - gaussian = pbn.DynamicGaussianNetwork(["a", "b", "c", "d"], 2) - gaussian.static_bn().add_arc("a_t_2", "d_t_1") - gaussian.transition_bn().add_arc("c_t_2", "b_t_0") - lg = LinearGaussianCPD("d_t_1", ["a_t_2"], [1, 2], 2) + gaussian = pbn.DynamicGaussianNetwork(["A", "B", "C", "D"], 2) + gaussian.static_bn().add_arc("A_t_2", "D_t_1") + gaussian.transition_bn().add_arc("C_t_2", "B_t_0") + lg = pbn.LinearGaussianCPD("D_t_1", ["A_t_2"], [1, 2], 2) gaussian.static_bn().add_cpds([lg]) - lg = LinearGaussianCPD("b_t_0", ["c_t_2"], [3, 4], 5) + lg = pbn.LinearGaussianCPD("B_t_0", ["C_t_2"], [3, 4], 5) gaussian.transition_bn().add_cpds([lg]) gaussian.include_cpd = True return pickle.dumps(gaussian) + @pytest.fixture def dyn_gaussian_fit_bytes(): - gaussian = pbn.DynamicGaussianNetwork(["a", "b", "c", "d"], 2) - gaussian.static_bn().add_arc("a_t_2", "d_t_1") - gaussian.transition_bn().add_arc("c_t_2", "b_t_0") - df = util_test.generate_normal_data_indep(1000) + gaussian = pbn.DynamicGaussianNetwork(["A", "B", "C", "D"], 2) + gaussian.static_bn().add_arc("A_t_2", "D_t_1") + gaussian.transition_bn().add_arc("C_t_2", "B_t_0") + df = generate_normal_data_independent(1000) gaussian.fit(df) gaussian.include_cpd = True return pickle.dumps(gaussian) + @pytest.fixture def dyn_other_partial_fit_bytes(): - variables = ["a", "b", "c", "d"] + variables = ["A", "B", "C", "D"] static_nodes = [v + "_t_" + str(m) for v in variables for m in range(1, 3)] transition_nodes = [v + "_t_0" for v in variables] - other_static = OtherBN(static_nodes, [("a_t_2", "d_t_1")], [("b_t_1", pbn.DiscreteFactorType()), - ("c_t_1", pbn.CKDEType()), - ("d_t_1", pbn.LinearGaussianCPDType())]) - lg = LinearGaussianCPD("d_t_1", ["a_t_2"], [1, 2], 2) + other_static = OtherBN( + static_nodes, + [("A_t_2", "D_t_1")], + [ + ("B_t_1", pbn.DiscreteFactorType()), + ("C_t_1", pbn.CKDEType()), + ("D_t_1", pbn.LinearGaussianCPDType()), + ], + ) + lg = pbn.LinearGaussianCPD("D_t_1", ["A_t_2"], [1, 2], 2) other_static.add_cpds([lg]) - other_transition = ConditionalOtherBN(transition_nodes, - static_nodes, - [("a_t_2", "d_t_0")], - [("b_t_0", pbn.DiscreteFactorType()), - ("c_t_0", pbn.CKDEType()), - ("d_t_0", pbn.LinearGaussianCPDType())]) - lg = LinearGaussianCPD("d_t_0", ["a_t_2"], [3, 4], 1.5) + other_transition = ConditionalOtherBN( + transition_nodes, + static_nodes, + [("A_t_2", "D_t_0")], + [ + ("B_t_0", pbn.DiscreteFactorType()), + ("C_t_0", pbn.CKDEType()), + ("D_t_0", pbn.LinearGaussianCPDType()), + ], + ) + lg = pbn.LinearGaussianCPD("D_t_0", ["A_t_2"], [3, 4], 1.5) other_transition.add_cpds([lg]) assert other_static.type() == other_transition.type() @@ -718,41 +872,57 @@ def dyn_other_partial_fit_bytes(): dyn_other.include_cpd = True return pickle.dumps(dyn_other) + @pytest.fixture def dyn_other_fit_bytes(): - variables = ["a", "b", "c", "d"] + variables = ["A", "B", "C", "D"] static_nodes = [v + "_t_" + str(m) for v in variables for m in range(1, 3)] transition_nodes = [v + "_t_0" for v in variables] - other_static = OtherBN(static_nodes, [("a_t_2", "d_t_1")], [("b_t_2", pbn.DiscreteFactorType()), - ("b_t_1", pbn.DiscreteFactorType()), - ("c_t_1", pbn.CKDEType()), - ("d_t_1", pbn.LinearGaussianCPDType())]) - lg = LinearGaussianCPD("d_t_1", ["a_t_2"], [1, 2], 2) + other_static = OtherBN( + static_nodes, + [("A_t_2", "D_t_1")], + [ + ("B_t_2", pbn.DiscreteFactorType()), + ("B_t_1", 
pbn.DiscreteFactorType()), + ("C_t_1", pbn.CKDEType()), + ("D_t_1", pbn.LinearGaussianCPDType()), + ], + ) + lg = pbn.LinearGaussianCPD("D_t_1", ["A_t_2"], [1, 2], 2) other_static.add_cpds([lg]) - other_transition = ConditionalOtherBN(transition_nodes, - static_nodes, - [("a_t_2", "d_t_0")], - [("b_t_0", pbn.DiscreteFactorType()), - ("c_t_0", pbn.CKDEType()), - ("d_t_0", pbn.LinearGaussianCPDType())]) - lg = LinearGaussianCPD("d_t_0", ["a_t_2"], [3, 4], 1.5) + other_transition = ConditionalOtherBN( + transition_nodes, + static_nodes, + [("A_t_2", "D_t_0")], + [ + ("B_t_0", pbn.DiscreteFactorType()), + ("C_t_0", pbn.CKDEType()), + ("D_t_0", pbn.LinearGaussianCPDType()), + ], + ) + lg = pbn.LinearGaussianCPD("D_t_0", ["A_t_2"], [3, 4], 1.5) other_transition.add_cpds([lg]) assert other_static.type() == other_transition.type() dyn_other = DynamicOtherBN(variables, 2, other_static, other_transition) - df_continuous = util_test.generate_normal_data_indep(1000) - df_discrete = util_test.generate_discrete_data_dependent(1000) + df_continuous = generate_normal_data_independent(1000) + df_discrete = generate_discrete_data(1000) df = df_continuous - df["b"] = df_discrete["B"] + df["B"] = df_discrete["B"] dyn_other.fit(df) dyn_other.include_cpd = True return pickle.dumps(dyn_other) -def test_serialization_fitted_dbn(dyn_gaussian_partial_fit_bytes, dyn_gaussian_fit_bytes, - dyn_other_partial_fit_bytes, dyn_other_fit_bytes): + +def test_serialization_fitted_dbn( + dyn_gaussian_partial_fit_bytes, + dyn_gaussian_fit_bytes, + dyn_other_partial_fit_bytes, + dyn_other_fit_bytes, +): # #################### # Gaussian partial fit # #################### @@ -760,15 +930,15 @@ def test_serialization_fitted_dbn(dyn_gaussian_partial_fit_bytes, dyn_gaussian_f assert not loaded_partial.fitted() assert not loaded_partial.static_bn().fitted() assert not loaded_partial.transition_bn().fitted() - cpd = loaded_partial.static_bn().cpd("d_t_1") - assert cpd.variable() == "d_t_1" - assert cpd.evidence() == ["a_t_2"] + cpd = loaded_partial.static_bn().cpd("D_t_1") + assert cpd.variable() == "D_t_1" + assert cpd.evidence() == ["A_t_2"] assert list(cpd.beta) == [1, 2] assert cpd.variance == 2 - cpd = loaded_partial.transition_bn().cpd("b_t_0") - assert cpd.variable() == "b_t_0" - assert cpd.evidence() == ["c_t_2"] + cpd = loaded_partial.transition_bn().cpd("B_t_0") + assert cpd.variable() == "B_t_0" + assert cpd.evidence() == ["C_t_2"] assert list(cpd.beta) == [3, 4] assert cpd.variance == 5 @@ -787,23 +957,25 @@ def test_serialization_fitted_dbn(dyn_gaussian_partial_fit_bytes, dyn_gaussian_f assert not loaded_partial.fitted() assert not loaded_partial.static_bn().fitted() assert not loaded_partial.transition_bn().fitted() - assert loaded_partial.static_bn().node_type("b_t_1") == pbn.DiscreteFactorType() - assert loaded_partial.static_bn().node_type("c_t_1") == pbn.CKDEType() - assert loaded_partial.static_bn().node_type("d_t_1") == pbn.LinearGaussianCPDType() - - assert loaded_partial.transition_bn().node_type("b_t_0") == pbn.DiscreteFactorType() - assert loaded_partial.transition_bn().node_type("c_t_0") == pbn.CKDEType() - assert loaded_partial.transition_bn().node_type("d_t_0") == pbn.LinearGaussianCPDType() - - cpd = loaded_partial.static_bn().cpd("d_t_1") - assert cpd.variable() == "d_t_1" - assert cpd.evidence() == ["a_t_2"] + assert loaded_partial.static_bn().node_type("B_t_1") == pbn.DiscreteFactorType() + assert loaded_partial.static_bn().node_type("C_t_1") == pbn.CKDEType() + assert 
loaded_partial.static_bn().node_type("D_t_1") == pbn.LinearGaussianCPDType() + + assert loaded_partial.transition_bn().node_type("B_t_0") == pbn.DiscreteFactorType() + assert loaded_partial.transition_bn().node_type("C_t_0") == pbn.CKDEType() + assert ( + loaded_partial.transition_bn().node_type("D_t_0") == pbn.LinearGaussianCPDType() + ) + + cpd = loaded_partial.static_bn().cpd("D_t_1") + assert cpd.variable() == "D_t_1" + assert cpd.evidence() == ["A_t_2"] assert list(cpd.beta) == [1, 2] assert cpd.variance == 2 - cpd = loaded_partial.transition_bn().cpd("d_t_0") - assert cpd.variable() == "d_t_0" - assert cpd.evidence() == ["a_t_2"] + cpd = loaded_partial.transition_bn().cpd("D_t_0") + assert cpd.variable() == "D_t_0" + assert cpd.evidence() == ["A_t_2"] assert list(cpd.beta) == [3, 4] assert cpd.variance == 1.5 @@ -814,22 +986,24 @@ def test_serialization_fitted_dbn(dyn_gaussian_partial_fit_bytes, dyn_gaussian_f assert loaded_fitted.fitted() assert loaded_fitted.static_bn().fitted() assert loaded_fitted.transition_bn().fitted() - assert loaded_partial.static_bn().node_type("b_t_1") == pbn.DiscreteFactorType() - assert loaded_partial.static_bn().node_type("c_t_1") == pbn.CKDEType() - assert loaded_partial.static_bn().node_type("d_t_1") == pbn.LinearGaussianCPDType() - - assert loaded_partial.transition_bn().node_type("b_t_0") == pbn.DiscreteFactorType() - assert loaded_partial.transition_bn().node_type("c_t_0") == pbn.CKDEType() - assert loaded_partial.transition_bn().node_type("d_t_0") == pbn.LinearGaussianCPDType() - - cpd = loaded_partial.static_bn().cpd("d_t_1") - assert cpd.variable() == "d_t_1" - assert cpd.evidence() == ["a_t_2"] + assert loaded_partial.static_bn().node_type("B_t_1") == pbn.DiscreteFactorType() + assert loaded_partial.static_bn().node_type("C_t_1") == pbn.CKDEType() + assert loaded_partial.static_bn().node_type("D_t_1") == pbn.LinearGaussianCPDType() + + assert loaded_partial.transition_bn().node_type("B_t_0") == pbn.DiscreteFactorType() + assert loaded_partial.transition_bn().node_type("C_t_0") == pbn.CKDEType() + assert ( + loaded_partial.transition_bn().node_type("D_t_0") == pbn.LinearGaussianCPDType() + ) + + cpd = loaded_partial.static_bn().cpd("D_t_1") + assert cpd.variable() == "D_t_1" + assert cpd.evidence() == ["A_t_2"] assert list(cpd.beta) == [1, 2] assert cpd.variance == 2 - cpd = loaded_partial.transition_bn().cpd("d_t_0") - assert cpd.variable() == "d_t_0" - assert cpd.evidence() == ["a_t_2"] + cpd = loaded_partial.transition_bn().cpd("D_t_0") + assert cpd.variable() == "D_t_0" + assert cpd.evidence() == ["A_t_2"] assert list(cpd.beta) == [3, 4] assert cpd.variance == 1.5 diff --git a/tests/serialization/serialize_models_type_test.py b/tests/serialization/serialize_models_type_test.py index 7f659108..3c2d1ca9 100644 --- a/tests/serialization/serialize_models_type_test.py +++ b/tests/serialization/serialize_models_type_test.py @@ -1,28 +1,35 @@ +import itertools +import pickle + import pytest + import pybnesian as pbn -import pickle -import itertools + @pytest.fixture def gaussian_type_bytes(): g = pbn.GaussianNetworkType() return pickle.dumps(g) + @pytest.fixture def spbn_type_bytes(): s = pbn.SemiparametricBNType() return pickle.dumps(s) + @pytest.fixture def kde_type_bytes(): k = pbn.KDENetworkType() return pickle.dumps(k) + @pytest.fixture def discrete_type_bytes(): d = pbn.DiscreteBNType() return pickle.dumps(d) + class NewBNType(pbn.BayesianNetworkType): def __init__(self): pbn.BayesianNetworkType.__init__(self) @@ -30,11 +37,13 @@ def 
__init__(self): def __str__(self): return "NewType" + @pytest.fixture def new_type_bytes(): nn = NewBNType() return pickle.dumps(nn) + class OtherBNType(pbn.BayesianNetworkType): def __init__(self): pbn.BayesianNetworkType.__init__(self) @@ -45,18 +54,25 @@ def __str__(self): def __getstate_extra__(self): return self.some_useful_info - + def __setstate_extra__(self, extra): self.some_useful_info = extra + @pytest.fixture def other_type_bytes(): o = OtherBNType() return pickle.dumps(o) -def test_serialization_bn_type(gaussian_type_bytes, spbn_type_bytes, kde_type_bytes, - discrete_type_bytes, new_type_bytes, other_type_bytes): +def test_serialization_bn_type( + gaussian_type_bytes, + spbn_type_bytes, + kde_type_bytes, + discrete_type_bytes, + new_type_bytes, + other_type_bytes, +): loaded_g = pickle.loads(gaussian_type_bytes) new_g = pbn.GaussianNetworkType() assert loaded_g == new_g @@ -85,4 +101,4 @@ def test_serialization_bn_type(gaussian_type_bytes, spbn_type_bytes, kde_type_by m = [loaded_g, loaded_s, loaded_k, loaded_d, loaded_nn, loaded_o] for t in itertools.combinations(m, 2): - assert t[0] != t[1] \ No newline at end of file + assert t[0] != t[1] diff --git a/vcpkg-configuration.json b/vcpkg-configuration.json index 9439b837..b3e14c03 100644 --- a/vcpkg-configuration.json +++ b/vcpkg-configuration.json @@ -1,6 +1,6 @@ { - "$schema": "https://raw.githubusercontent.com/microsoft/vcpkg-tool/main/docs/vcpkg-configuration.schema.json", - "overlay-ports": [ - "./overlay_ports" - ] - } \ No newline at end of file + "$schema": "https://raw.githubusercontent.com/microsoft/vcpkg-tool/main/docs/vcpkg-configuration.schema.json", + "overlay-ports": [ + "./overlay_ports" + ] +} \ No newline at end of file diff --git a/vcpkg.json b/vcpkg.json index 202301b1..e25cca37 100644 --- a/vcpkg.json +++ b/vcpkg.json @@ -9,4 +9,4 @@ "boost-dynamic-bitset", "opencl" ] -} +} \ No newline at end of file
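
As a companion to the dynamic-network fixtures above, the round trip they exercise can be reproduced standalone. The sketch below is illustrative, not part of the test suite: it assumes only `pybnesian` (imported as `pbn`) and the standard-library `pickle`, and reuses the exact calls shown in the fixtures (`DynamicGaussianNetwork`, `add_arc`, `add_cpds`, and the `include_cpd` flag the fixtures set so that pickling carries the CPDs along with the graph).

    import pickle

    import pybnesian as pbn

    # Same structure as the fixtures: four variables, markovian order 2.
    gaussian = pbn.DynamicGaussianNetwork(["A", "B", "C", "D"], 2)
    gaussian.static_bn().add_arc("A_t_2", "D_t_1")
    gaussian.transition_bn().add_arc("C_t_2", "B_t_0")

    # One hand-written CPD per subnetwork (beta = [intercept, coefficient]).
    gaussian.static_bn().add_cpds([pbn.LinearGaussianCPD("D_t_1", ["A_t_2"], [1, 2], 2)])
    gaussian.transition_bn().add_cpds([pbn.LinearGaussianCPD("B_t_0", ["C_t_2"], [3, 4], 5)])

    # The fixtures set this flag before pickling so the CPDs are serialized too.
    gaussian.include_cpd = True
    loaded = pickle.loads(pickle.dumps(gaussian))

    assert loaded.static_bn().arcs() == [("A_t_2", "D_t_1")]
    assert loaded.transition_bn().arcs() == [("C_t_2", "B_t_0")]
    cpd = loaded.static_bn().cpd("D_t_1")
    assert list(cpd.beta) == [1, 2] and cpd.variance == 2
    # Only one CPD per subnetwork is specified, so the model is partially fitted.
    assert not loaded.fitted()

This mirrors the partial-fit path checked in `test_serialization_fitted_dbn`; the fully fitted path differs only in calling `fit(df)` before pickling, after which the CPD parameters are estimated from the data rather than hand-written.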
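
The custom-type tests rely on pybnesian's extra-state hooks: a Python-defined `BayesianNetworkType` survives pickling, and its additional attributes come along because the class implements `__getstate_extra__`/`__setstate_extra__`, as `OtherBNType` and `DynamicOtherBN` do above. A minimal sketch of that pattern follows; `SketchType` and its `note` attribute are hypothetical names chosen for illustration, not library API.

    import pickle

    import pybnesian as pbn


    class SketchType(pbn.BayesianNetworkType):
        # Hypothetical user-defined type carrying extra picklable state.
        def __init__(self):
            pbn.BayesianNetworkType.__init__(self)
            self.note = "kept across pickling"

        def __str__(self):
            return "SketchType"

        # pybnesian calls these hooks while (un)pickling the type, so plain
        # Python attributes survive the round trip -- the same mechanism the
        # tests use to preserve `some_useful_info` and `extra_info`.
        def __getstate_extra__(self):
            return self.note

        def __setstate_extra__(self, extra):
            self.note = extra


    loaded = pickle.loads(pickle.dumps(SketchType()))
    assert loaded == SketchType()  # instances of the same user type compare equal
    assert str(loaded) == "SketchType"
    assert loaded.note == "kept across pickling"

This is the same behavior `test_serialization_dbn_model` checks when it asserts `loaded_other.extra_info == "extra"` after the round trip.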