From 2724133e4e4995f0bd100ee1fe87e05430d27501 Mon Sep 17 00:00:00 2001 From: coder-jayp Date: Mon, 2 Feb 2026 23:21:54 +0530 Subject: [PATCH] Add CatBoostCvExperiment integration, tests, and __init__ update --- .gitignore | Bin 1893 -> 2166 bytes .../experiment/integrations/__init__.py | 3 + .../experiment/integrations/catboost_cv.py | 120 ++++++++++++++++++ .../test_integrations/test_catboost_cv.py | 70 ++++++++++ 4 files changed, 193 insertions(+) create mode 100644 src/hyperactive/experiment/integrations/catboost_cv.py create mode 100644 src/hyperactive/tests/test_integrations/test_catboost_cv.py diff --git a/.gitignore b/.gitignore index 3e7b58f97f5b376ff559bc70d7db8c51198ca07f..f6093d1e80b2bcb6fe70abcc7b49c266c45e6377 100644 GIT binary patch literal 2166 zcmah~ZENH<5ccPV{SSf^3QO#TwpXt0r@JIauiT}fx$Bowj4Y3Lm9;ghB=5%k^?gQm z_L5R43oFfwW=5mu8DFTkQ=-!F;B8N>+A8+R4)*W7zy9z+bQHwAoW#Z`Z3$ z<)=Rkw|`>$m7glp>jOx&9dQ9Y^We3fsiahJDZA`&7V}b!`yfz2Dvu?LuRd-X` zvskyu2&!5+`fNzbN^|+ z<=ymt@e0xm^*N3>GO4tuNOLb0GO1Ir30+dnq_mpTs%V1sNp*n-C=(67*NJkNhC0Tn zXVFz|R+PYKI9;m9N<(C2-2<7Va5364%mJE?BmTvhDye+X&UO@2ejra~F8*A)>UoF9 zlTKk*rAIi5&{9u8OWNC5wl*LQhiNuTnb&kgz|FSC1J?6xG6Vn=Do-2^5wDI5iSK7; z6YdxjuweK|LGMTf@{G4zul8x^p_Q&Dew4+{%abcKPuprEfJ30I7eXgBwzYLyB({T#;*HO zEa(`p$VT@yib;TYL|EXrsp4O~P4w;aefhv+a2mx*uy^3m;T$Ui&KZ)zPSk*6lw!GD zRow4Q(0)4mSLiw9In=%1Tlc71Hse4p<@$w0CT{C9(L{P7L{x)>+S@u%M}Z{09uwC) za(JXfLQu(lH@y|Q5^$b$*pX0F^uLVK!=P?CQNw&d?bc7OsqsMO;2AK)1B>VTyM^~Y zuuK`irqg3Dfp9`bLT%V}c9+{HC;UVX5R+GewFGKp%*@e%&uj$g)Ei?swCKF>!B1Xq zXY0kQA6{RA1Ik8V4MPu|X))}&KlJcYHiV5OHpxnAllby7mTOUdH|v{L`m3Ct$th@=P4Ym!?&Z^ZKA zH+SmQ>u-KiV}w5@7gf7T?QGhQ4X8D2uW{eD{Pl{R^?aJY9P_lNm_Hs;!WAY- zSda^zW~m{a$_SXWV-GHWQC8lza|!pD5+8NiAn zj>AC+`EQbLBc{ZB;b9U!SF)UkQq_uBJw$>2^Sz|LoA8}T6Y^)i2|dvleB02_LKmrM z3dS#DR~NQ$MF(kHtcw-oy70wRh@UC;1!c`_+Z5Ge6VG{{;4#Pfa$@%WzfAuCCwiw` literal 1893 zcmZuyO^@6*487}D2uOhfi?t23O_8>zWOsvZk&i_;>7^(PS*AVOu|-vqXV(4oeUxW{ zNzub-D3T&6>OJ~}ezRt(EYHDB)ayob$ZmH36#4euJ3T<3RNIEt8oQ&mZPZmA);}%x zf2pf)!dlzOq@>gfeWU3)lTVI)f>UBq-nx{d>y`{H!UtpbW^{g3^~0KX>{Yk8sc%$I z2bwrkG>!>7oWjaSKe6ed*wpp*H@U7+SbqGWQNhFvd)iJMOl;I~M>IhfT&Rj%&6G`V zvVq^uL~24wy&2(nC2=d~2X0jDe9C4*Gip1Gn@PQY{o(%Y-Qy>@(8GFv@`%{!&uM}8 zs`HK#ZQasHp^wpJG(&f*HhRmalZ`H9Jw%?N%ru9|WRfTy4U9tgQ>rJe8Ymjy%hZluf zPPQdbBei{s#wWD<1kRb|EN3y@J{do(VI!ayaf;T=O_h5-V#bVQU`)iPStiTxIWrw!g}_t(Nkb{`m!D$Krr*ti-Z%RB%O2D3~E0dFC3YuOKOlr|w~SjivC6TnpA znPegqg9~B--t}v1IriwSAUdqUG+--?FJAui>KZ84E&(~1OK3xnVN`H$r=r(*dHJX#U>h9fo_9?As>; zqX+sEB<}pdMfQT}r0N2XgRJ;9N~A{JefR@o-XH4D>^I#eNrJEfw;Jdxlkdz*h2bWK zlfG=vGEV)@f&S-Ag!;m3-V43{!?-4a=W?}q1*B_P&!eR7Wmi8XK>9+2@571-o>wF- M>;chmd3 Optional[list]: + return None + + def _evaluate(self, params: Dict[str, Any]) -> Tuple[float, Dict[str, Any]]: + """Run CatBoost CV and return mean test metric + metadata.""" + cv_params = params.copy() + + cv_params.setdefault("loss_function", self.loss_function) + + cv_params.setdefault("iterations", self.iterations) + + if self.metric != self.loss_function: + custom_metrics = cv_params.get("custom_metric", []) + if isinstance(custom_metrics, str): + custom_metrics = [custom_metrics] + if self.metric not in custom_metrics: + custom_metrics.append(self.metric) + cv_params["custom_metric"] = custom_metrics + + try: + cv_results: pd.DataFrame = cv( + params=cv_params, + pool=self.pool, + fold_count=self.fold_count, + early_stopping_rounds=self.early_stopping_rounds, + partition_random_seed=self.partition_random_seed, + type=self.type, + verbose=False, + plot=False, + return_models=False, + as_pandas=True, + ) + except Exception as e: + raise RuntimeError( + f"CatBoost CV failed with params: {cv_params}\n" + f"Error: {str(e)}" + ) from e + + target_col = f"test-{self.metric}-mean" + + if target_col not in cv_results.columns: + available = ", ".join(cv_results.columns) + raise ValueError( + f"Expected column '{target_col}' not found in cv_results.\n" + f"Available columns: {available}\n" + f"Check that metric='{self.metric}' is computed. " + f"Current loss_function: '{cv_params.get('loss_function')}'" + ) + + mean_score = float(cv_results[target_col].iloc[-1]) + + metadata = { + "cv_results": cv_results.to_dict(orient="records"), + "final_iteration": int(cv_results["iterations"].iloc[-1]), + "target_column": target_col, + "used_loss_function": cv_params["loss_function"], + "custom_metrics_used": cv_params.get("custom_metric", None), + } + + return mean_score, metadata \ No newline at end of file diff --git a/src/hyperactive/tests/test_integrations/test_catboost_cv.py b/src/hyperactive/tests/test_integrations/test_catboost_cv.py new file mode 100644 index 00000000..6448ee18 --- /dev/null +++ b/src/hyperactive/tests/test_integrations/test_catboost_cv.py @@ -0,0 +1,70 @@ +import numpy as np +import pytest +from catboost import Pool +from hyperactive.experiment.integrations import CatBoostCvExperiment + +@pytest.fixture +def dummy_binary_pool(): + """Create a small random binary classification dataset as CatBoost Pool.""" + np.random.seed(42) + X = np.random.rand(400, 8) + y = np.random.randint(0, 2, 400) + return Pool(data=X, label=y) + +def test_catboost_cv_runs_and_returns_valid_score(dummy_binary_pool): + """Basic sanity test: ensure CatBoostCvExperiment runs cv() correctly.""" + exp = CatBoostCvExperiment( + pool=dummy_binary_pool, + metric="Logloss", + fold_count=4, + iterations=50, + early_stopping_rounds=10, + ) + + params = { + "learning_rate": 0.03, + "depth": 5, + "l2_leaf_reg": 3.0, + } + + raw_score, metadata = exp.evaluate(params) + signed_score, _ = exp.score(params) + + assert isinstance(raw_score, float) + assert 0 < raw_score < 1, f"Logloss out of expected range: {raw_score:.4f}" + assert signed_score < 0, "Signed score should be negative (lower is better)" + assert exp.get_tag("property:higher_or_lower_is_better") == "lower" + + assert isinstance(metadata, dict) + assert "cv_results" in metadata + assert "target_column" in metadata + assert metadata["target_column"] == "test-Logloss-mean" + assert "final_iteration" in metadata + assert metadata["final_iteration"] > 0 + assert "used_loss_function" in metadata + assert metadata["used_loss_function"] == "Logloss" + +def test_catboost_cv_with_auc(dummy_binary_pool): + """Test AUC metric with Logloss objective (shows metric vs loss separation).""" + exp = CatBoostCvExperiment( + pool=dummy_binary_pool, + loss_function="Logloss", + metric="AUC", + fold_count=3, + iterations=30, + ) + + params = {"learning_rate": 0.05, "depth": 4} + + raw_score, metadata = exp.evaluate(params) + signed_score, _ = exp.score(params) + + # AUC on random data is noisy, but typically around 0.5 ± 0.2–0.3 + assert 0.25 < raw_score < 0.75, f"Unexpected AUC on random data: {raw_score:.4f}" + + # Optimizing Logloss (lower-better) → tag is "lower", signed_score is negative + assert signed_score < 0, f"Signed score should be negative, got {signed_score:.4f}" + + assert exp.get_tag("property:higher_or_lower_is_better") == "lower" + assert "used_loss_function" in metadata + assert metadata["used_loss_function"] == "Logloss" \ No newline at end of file