Skip to content
Open
29 changes: 21 additions & 8 deletions examples/Advanced/tasks_tutorial.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,15 @@
#
# We will start by simply listing only *supervised classification* tasks.
#
# **openml.tasks.list_tasks()** returns a dictionary of dictionaries by default, but we
# request a
# **openml.list_tasks()** (or **openml.tasks.list_tasks()**) returns a dictionary of dictionaries by default, but we request a
# [pandas dataframe](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html)
# instead to have better visualization capabilities and easier access:

# %%
tasks = openml.tasks.list_tasks(task_type=TaskType.SUPERVISED_CLASSIFICATION)
# New: top-level convenience alias
tasks = openml.list_tasks(task_type=TaskType.SUPERVISED_CLASSIFICATION)
# Old path still works:
# tasks = openml.tasks.list_tasks(task_type=TaskType.SUPERVISED_CLASSIFICATION)
print(tasks.columns)
print(f"First 5 of {len(tasks)} tasks:")
print(tasks.head())
Expand Down Expand Up @@ -66,23 +68,29 @@
# Similar to listing tasks by task type, we can list tasks by tags:

# %%
tasks = openml.tasks.list_tasks(tag="OpenML100")
tasks = openml.list_tasks(tag="OpenML100")
# Old path still works:
# tasks = openml.tasks.list_tasks(tag="OpenML100")
print(f"First 5 of {len(tasks)} tasks:")
print(tasks.head())

# %% [markdown]
# Furthermore, we can list tasks based on the dataset id:

# %%
tasks = openml.tasks.list_tasks(data_id=1471)
tasks = openml.list_tasks(data_id=1471)
# Old path still works:
# tasks = openml.tasks.list_tasks(data_id=1471)
print(f"First 5 of {len(tasks)} tasks:")
print(tasks.head())

# %% [markdown]
# In addition, a size limit and an offset can be applied both separately and simultaneously:

# %%
tasks = openml.tasks.list_tasks(size=10, offset=50)
tasks = openml.list_tasks(size=10, offset=50)
# Old path still works:
# tasks = openml.tasks.list_tasks(size=10, offset=50)
print(tasks)

# %% [markdown]
Expand All @@ -98,7 +106,9 @@
# Finally, it is also possible to list all tasks on OpenML with:

# %%
tasks = openml.tasks.list_tasks()
tasks = openml.list_tasks()
# Old path still works:
# tasks = openml.tasks.list_tasks()
print(len(tasks))

# %% [markdown]
Expand All @@ -118,7 +128,10 @@

# %%
task_id = 31
task = openml.tasks.get_task(task_id)
# New: top-level convenience alias
task = openml.get_task(task_id)
# Old path still works:
# task = openml.tasks.get_task(task_id)

# %%
# Properties of the task are stored as member variables:
Expand Down
10 changes: 8 additions & 2 deletions examples/Basics/simple_datasets_tutorial.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,15 +14,21 @@
# ## List datasets stored on OpenML

# %%
datasets_df = openml.datasets.list_datasets()
# New: top-level convenience alias
datasets_df = openml.list_datasets()
# Old path still works for backwards compatibility:
# datasets_df = openml.datasets.list_datasets()
print(datasets_df.head(n=10))

# %% [markdown]
# ## Download a dataset

# %%
# Iris dataset https://www.openml.org/d/61
dataset = openml.datasets.get_dataset(dataset_id=61)
# New: top-level convenience alias
dataset = openml.get_dataset(dataset_id=61)
# Old path still works:
# dataset = openml.datasets.get_dataset(dataset_id=61)

# Print a summary
print(
Expand Down
7 changes: 5 additions & 2 deletions examples/Basics/simple_flows_and_runs_tutorial.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,10 @@
# NOTE: We are using task 119 from the test server: https://test.openml.org/d/20

# %%
task = openml.tasks.get_task(119)
# New: top-level convenience alias
task = openml.get_task(119)
# Old path still works:
# task = openml.tasks.get_task(119)

# Get the data
dataset = task.get_dataset()
Expand All @@ -54,7 +57,7 @@

# %% [markdown]
# ## Upload the machine learning experiments to OpenML
# First, create a fow and fill it with metadata about the machine learning model.
# First, create a flow and fill it with metadata about the machine learning model.

# %%
knn_flow = openml.flows.OpenMLFlow(
Expand Down
5 changes: 4 additions & 1 deletion examples/Basics/simple_tasks_tutorial.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,10 @@
# [supervised classification on credit-g](https://www.openml.org/search?type=task&id=31&source_data.data_id=31):

# %%
task = openml.tasks.get_task(31)
# New: top-level convenience alias
task = openml.get_task(31)
# Old path still works:
# task = openml.tasks.get_task(31)

# %% [markdown]
# Get the dataset and its data from the task.
Expand Down
13 changes: 13 additions & 0 deletions openml/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,12 @@
)
from .__version__ import __version__
from .datasets import OpenMLDataFeature, OpenMLDataset
from .datasets.functions import get_dataset, list_datasets
from .evaluations import OpenMLEvaluation
from .flows import OpenMLFlow
from .flows.functions import get_flow, list_flows
from .runs import OpenMLRun
from .runs.functions import get_run, list_runs
from .setups import OpenMLParameter, OpenMLSetup
from .study import OpenMLBenchmarkSuite, OpenMLStudy
from .tasks import (
Expand All @@ -48,6 +51,7 @@
OpenMLSupervisedTask,
OpenMLTask,
)
from .tasks.functions import get_task, list_tasks


def populate_cache(
Expand Down Expand Up @@ -98,6 +102,7 @@ def populate_cache(
"OpenMLDataset",
"OpenMLEvaluation",
"OpenMLFlow",
"OpenMLFlow",
"OpenMLLearningCurveTask",
"OpenMLParameter",
"OpenMLRegressionTask",
Expand All @@ -115,6 +120,14 @@ def populate_cache(
"exceptions",
"extensions",
"flows",
"get_dataset",
"get_flow",
"get_run",
"get_task",
"list_datasets",
"list_flows",
"list_runs",
"list_tasks",
"runs",
"setups",
"study",
Expand Down
14 changes: 14 additions & 0 deletions tests/test_openml/test_openml.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,3 +41,17 @@ def test_populate_cache(
assert task_mock.call_count == 2
for argument, fixture in zip(task_mock.call_args_list, [(1,), (2,)]):
assert argument[0] == fixture

def test_top_level_getters_aliases(self):
    """Check that every top-level convenience alias is the very same object
    as the underlying implementation in its submodule."""
    alias_to_impl = [
        (openml.list_datasets, openml.datasets.functions.list_datasets),
        (openml.get_dataset, openml.datasets.functions.get_dataset),
        (openml.list_flows, openml.flows.functions.list_flows),
        (openml.get_flow, openml.flows.functions.get_flow),
        (openml.list_runs, openml.runs.functions.list_runs),
        (openml.get_run, openml.runs.functions.get_run),
        (openml.list_tasks, openml.tasks.functions.list_tasks),
        (openml.get_task, openml.tasks.functions.get_task),
    ]
    # Identity (not mere equality) guarantees the alias is a re-export,
    # not a wrapper with potentially diverging behavior.
    for alias, implementation in alias_to_impl:
        assert alias is implementation
53 changes: 53 additions & 0 deletions tests/test_runs/test_run_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -1096,6 +1096,59 @@ def test_local_run_metric_score(self):

self._test_local_evaluations(run)

@pytest.mark.sklearn()
@pytest.mark.uses_test_server()
def test_run_flow_on_task_basic(self):
    """Test that run_flow_on_task executes successfully with basic flow and task."""
    # A minimal preprocessing + classification pipeline; small forest and a
    # fixed seed keep the test fast and deterministic.
    pipeline = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("encoder", OneHotEncoder(handle_unknown="ignore")),
            ("estimator", RandomForestClassifier(n_estimators=5, random_state=42)),
        ],
    )

    # Translate the sklearn model into an OpenML flow description.
    model_flow = self.extension.model_to_flow(pipeline)

    target_task = openml.tasks.get_task(119)

    # upload_flow=False keeps the flow local; we only exercise the run itself.
    produced_run = openml.runs.run_flow_on_task(
        flow=model_flow,
        task=target_task,
        upload_flow=False,
    )

    # The run must reference the task, flow, and dataset it was built from,
    # and must carry non-empty prediction data.
    assert produced_run.task_id == target_task.task_id
    assert produced_run.flow_name == model_flow.name
    assert produced_run.dataset_id == target_task.dataset_id
    assert produced_run.data_content is not None
    assert len(produced_run.data_content) > 0

@pytest.mark.sklearn()
@pytest.mark.uses_test_server()
def test_run_flow_on_task_with_flow_tags(self):
    """Test run_flow_on_task with custom flow tags (for the flow, not the run)."""
    # Bare classifier (no pipeline) — tags, not preprocessing, are under test.
    estimator = RandomForestClassifier(n_estimators=5, random_state=42)
    model_flow = self.extension.model_to_flow(estimator)
    target_task = openml.tasks.get_task(119)

    # flow_tags attach to the flow object itself, not to the resulting run;
    # upload_flow=False keeps everything local.
    produced_run = openml.runs.run_flow_on_task(
        flow=model_flow,
        task=target_task,
        flow_tags=["test_flow_tag_1", "test_flow_tag_2"],
        upload_flow=False,
    )

    # The run must reference its task and flow and carry prediction data.
    assert produced_run.task_id == target_task.task_id
    assert produced_run.flow_name == model_flow.name
    assert produced_run.data_content is not None

@pytest.mark.production()
def test_online_run_metric_score(self):
self.use_production_server()
Expand Down