From 2f98934f73e99d1ca51032a3844e9a1992935e26 Mon Sep 17 00:00:00 2001 From: Prtm2110 Date: Tue, 24 Feb 2026 15:23:57 +0530 Subject: [PATCH 1/5] added ontology --- src/database/datasets.py | 17 +++++++++++++++++ src/routers/openml/datasets.py | 7 +++++++ src/schemas/datasets/openml.py | 1 + 3 files changed, 25 insertions(+) diff --git a/src/database/datasets.py b/src/database/datasets.py index f011a651..aff4042a 100644 --- a/src/database/datasets.py +++ b/src/database/datasets.py @@ -131,6 +131,23 @@ def get_features(dataset_id: int, connection: Connection) -> list[Feature]: return [Feature(**row, nominal_values=None) for row in rows.mappings()] +def get_feature_ontologies(dataset_id: int, connection: Connection) -> dict[int, list[str]]: + rows = connection.execute( + text( + """ + SELECT `index`, `value` + FROM data_feature_description + WHERE `did` = :dataset_id AND `description_type` = 'ontology' + """, + ), + parameters={"dataset_id": dataset_id}, + ) + ontologies: dict[int, list[str]] = {} + for row in rows: + ontologies.setdefault(row.index, []).append(row.value) + return ontologies + + def get_feature_values(dataset_id: int, *, feature_index: int, connection: Connection) -> list[str]: rows = connection.execute( text( diff --git a/src/routers/openml/datasets.py b/src/routers/openml/datasets.py index dda25117..856d6ba1 100644 --- a/src/routers/openml/datasets.py +++ b/src/routers/openml/datasets.py @@ -287,6 +287,13 @@ def get_dataset_features( ) -> list[Feature]: _get_dataset_raise_otherwise(dataset_id, user, expdb) features = database.datasets.get_features(dataset_id, expdb) + + # Attach ontologies from data_feature_description + ontologies = database.datasets.get_feature_ontologies(dataset_id, expdb) + for feature in features: + if feature.index in ontologies: + feature.ontology = ontologies[feature.index] + for feature in [f for f in features if f.data_type == FeatureType.NOMINAL]: feature.nominal_values = database.datasets.get_feature_values( dataset_id, diff --git a/src/schemas/datasets/openml.py b/src/schemas/datasets/openml.py index 8edb373c..b1f51574 100644 --- a/src/schemas/datasets/openml.py +++ b/src/schemas/datasets/openml.py @@ -40,6 +40,7 @@ class Feature(BaseModel): index: int name: str data_type: FeatureType + ontology: list[str] | None = None is_target: bool is_ignore: bool is_row_identifier: bool From 732b32622af39fe22920fa35518c8532a7a2d3c7 Mon Sep 17 00:00:00 2001 From: Prtm2110 Date: Tue, 24 Feb 2026 15:45:58 +0530 Subject: [PATCH 2/5] changed to use .mappings() --- src/database/datasets.py | 4 ++-- src/routers/openml/datasets.py | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/database/datasets.py b/src/database/datasets.py index aff4042a..f18b21e7 100644 --- a/src/database/datasets.py +++ b/src/database/datasets.py @@ -143,8 +143,8 @@ def get_feature_ontologies(dataset_id: int, connection: Connection) -> dict[int, parameters={"dataset_id": dataset_id}, ) ontologies: dict[int, list[str]] = {} - for row in rows: - ontologies.setdefault(row.index, []).append(row.value) + for row in rows.mappings(): + ontologies.setdefault(row["index"], []).append(row["value"]) return ontologies diff --git a/src/routers/openml/datasets.py b/src/routers/openml/datasets.py index 856d6ba1..1072296b 100644 --- a/src/routers/openml/datasets.py +++ b/src/routers/openml/datasets.py @@ -291,8 +291,7 @@ def get_dataset_features( # Attach ontologies from data_feature_description ontologies = database.datasets.get_feature_ontologies(dataset_id, expdb) for feature in features: - if feature.index in ontologies: - feature.ontology = ontologies[feature.index] + feature.ontology = ontologies.get(feature.index) for feature in [f for f in features if f.data_type == FeatureType.NOMINAL]: feature.nominal_values = database.datasets.get_feature_values( From 7ef810b38625f197eafa2791aa3831eea80dfb39 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Mon, 16 Mar 2026 12:18:26 +0100 Subject: [PATCH 3/5] Use async database connection for fetching ontology data --- src/database/datasets.py | 7 +++++-- src/routers/openml/datasets.py | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/database/datasets.py b/src/database/datasets.py index 1153f1e1..5b65d5a4 100644 --- a/src/database/datasets.py +++ b/src/database/datasets.py @@ -134,8 +134,11 @@ async def get_features(dataset_id: int, connection: AsyncConnection) -> list[Fea return [Feature(**row, nominal_values=None) for row in rows] -def get_feature_ontologies(dataset_id: int, connection: Connection) -> dict[int, list[str]]: - rows = connection.execute( +async def get_feature_ontologies( + dataset_id: int, + connection: AsyncConnection, +) -> dict[int, list[str]]: + rows = await connection.execute( text( """ SELECT `index`, `value` diff --git a/src/routers/openml/datasets.py b/src/routers/openml/datasets.py index 2e87f000..164efd7d 100644 --- a/src/routers/openml/datasets.py +++ b/src/routers/openml/datasets.py @@ -294,7 +294,7 @@ async def get_dataset_features( assert expdb is not None # noqa: S101 await _get_dataset_raise_otherwise(dataset_id, user, expdb) features = await database.datasets.get_features(dataset_id, expdb) - ontologies = database.datasets.get_feature_ontologies(dataset_id, expdb) + ontologies = await database.datasets.get_feature_ontologies(dataset_id, expdb) for feature in features: feature.ontology = ontologies.get(feature.index) From 829e56af3e46929f7856e12d679b06b7e52df225 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Mon, 16 Mar 2026 12:26:00 +0100 Subject: [PATCH 4/5] Use defaultdict instead. We always want the same default --- src/database/datasets.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/database/datasets.py b/src/database/datasets.py index 5b65d5a4..26eb33d8 100644 --- a/src/database/datasets.py +++ b/src/database/datasets.py @@ -1,6 +1,7 @@ """Translation from https://github.com/openml/OpenML/blob/c19c9b99568c0fabb001e639ff6724b9a754bbc9/openml_OS/models/api/v1/Api_data.php#L707.""" import datetime +from collections import defaultdict from sqlalchemy import text from sqlalchemy.engine import Row @@ -148,9 +149,9 @@ async def get_feature_ontologies( ), parameters={"dataset_id": dataset_id}, ) - ontologies: dict[int, list[str]] = {} + ontologies: dict[int, list[str]] = defaultdict(list) for row in rows.mappings(): - ontologies.setdefault(row["index"], []).append(row["value"]) + ontologies[row["index"]].append(row["value"]) return ontologies From 40caa49085fa35ab9dc5ff535d753ac424b497c9 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Mon, 16 Mar 2026 12:33:52 +0100 Subject: [PATCH 5/5] Coverage for ontology will be added with 262 --- tests/routers/openml/migration/datasets_migration_test.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/routers/openml/migration/datasets_migration_test.py b/tests/routers/openml/migration/datasets_migration_test.py index 5ff6fe86..75f30863 100644 --- a/tests/routers/openml/migration/datasets_migration_test.py +++ b/tests/routers/openml/migration/datasets_migration_test.py @@ -259,6 +259,8 @@ async def test_datasets_feature_is_identical( values = feature.pop(key) # The old API returns a str if there is only a single element feature["nominal_value"] = values if len(values) > 1 else values[0] + elif key == "ontology": + del feature[key] # Added back in with follow up PR #262 else: # The old API formats bool as string in lower-case feature[key] = str(value) if not isinstance(value, bool) else str(value).lower()