Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,12 @@ classifiers = [
"Programming Language :: Python",
"Programming Language :: Rust",
]
dependencies = ["pyarrow>=11.0.0", "typing-extensions;python_version<'3.13'"]
dependencies = ["typing-extensions;python_version<'3.13'"]
dynamic = ["version"]

[project.optional-dependencies]
pyarrow = ["pyarrow>=11.0.0"]

[project.urls]
homepage = "https://datafusion.apache.org/python"
documentation = "https://datafusion.apache.org/python"
Expand Down
45 changes: 30 additions & 15 deletions python/datafusion/context.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@

import pandas as pd
import polars as pl # type: ignore[import]
import pyarrow as pa # Optional: only needed for type hints

from datafusion.catalog import CatalogProvider, Table
from datafusion.expr import SortKey
Expand All @@ -58,6 +59,16 @@
)


class ArrowSchemaExportable(Protocol):
    """Structural type for any object that can export an Arrow schema.

    An object satisfies this protocol by implementing the
    ``__arrow_c_schema__`` method defined by the Arrow PyCapsule Interface:
    https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html
    """

    def __arrow_c_schema__(self) -> object: ...  # noqa: D105


class ArrowStreamExportable(Protocol):
"""Type hint for object exporting Arrow C Stream via Arrow PyCapsule Interface.

Expand All @@ -66,7 +77,8 @@ class ArrowStreamExportable(Protocol):

def __arrow_c_stream__( # noqa: D105
self, requested_schema: object | None = None
) -> object: ...
) -> object:
...


class ArrowArrayExportable(Protocol):
Expand All @@ -77,7 +89,8 @@ class ArrowArrayExportable(Protocol):

def __arrow_c_array__( # noqa: D105
self, requested_schema: object | None = None
) -> tuple[object, object]: ...
) -> tuple[object, object]:
...


class TableProviderExportable(Protocol):
Expand All @@ -86,7 +99,8 @@ class TableProviderExportable(Protocol):
https://datafusion.apache.org/python/user-guide/io/table_provider.html
"""

def __datafusion_table_provider__(self) -> object: ... # noqa: D105
def __datafusion_table_provider__(self) -> object: # noqa: D105
...


class CatalogProviderExportable(Protocol):
Expand All @@ -95,7 +109,8 @@ class CatalogProviderExportable(Protocol):
https://docs.rs/datafusion/latest/datafusion/catalog/trait.CatalogProvider.html
"""

def __datafusion_catalog_provider__(self) -> object: ... # noqa: D105
def __datafusion_catalog_provider__(self) -> object: # noqa: D105
...


class SessionConfig:
Expand Down Expand Up @@ -561,7 +576,7 @@ def register_listing_table(
path: str | pathlib.Path,
table_partition_cols: list[tuple[str, str | pa.DataType]] | None = None,
file_extension: str = ".parquet",
schema: pa.Schema | None = None,
schema: ArrowSchemaExportable | None = None,
file_sort_order: Sequence[Sequence[SortKey]] | None = None,
) -> None:
"""Register multiple files as a single table.
Expand Down Expand Up @@ -630,7 +645,7 @@ def create_dataframe(
self,
partitions: list[list[pa.RecordBatch]],
name: str | None = None,
schema: pa.Schema | None = None,
schema: ArrowSchemaExportable | None = None,
) -> DataFrame:
"""Create and return a dataframe using the provided partitions.

Expand Down Expand Up @@ -820,7 +835,7 @@ def register_parquet(
parquet_pruning: bool = True,
file_extension: str = ".parquet",
skip_metadata: bool = True,
schema: pa.Schema | None = None,
schema: ArrowSchemaExportable | None = None,
file_sort_order: Sequence[Sequence[SortKey]] | None = None,
) -> None:
"""Register a Parquet file as a table.
Expand Down Expand Up @@ -862,7 +877,7 @@ def register_csv(
self,
name: str,
path: str | pathlib.Path | list[str | pathlib.Path],
schema: pa.Schema | None = None,
schema: ArrowSchemaExportable | None = None,
has_header: bool = True,
delimiter: str = ",",
schema_infer_max_records: int = 1000,
Expand Down Expand Up @@ -905,7 +920,7 @@ def register_json(
self,
name: str,
path: str | pathlib.Path,
schema: pa.Schema | None = None,
schema: ArrowSchemaExportable | None = None,
schema_infer_max_records: int = 1000,
file_extension: str = ".json",
table_partition_cols: list[tuple[str, str | pa.DataType]] | None = None,
Expand Down Expand Up @@ -944,7 +959,7 @@ def register_avro(
self,
name: str,
path: str | pathlib.Path,
schema: pa.Schema | None = None,
schema: ArrowSchemaExportable | None = None,
file_extension: str = ".avro",
table_partition_cols: list[tuple[str, str | pa.DataType]] | None = None,
) -> None:
Expand Down Expand Up @@ -1019,7 +1034,7 @@ def session_id(self) -> str:
def read_json(
self,
path: str | pathlib.Path,
schema: pa.Schema | None = None,
schema: ArrowSchemaExportable | None = None,
schema_infer_max_records: int = 1000,
file_extension: str = ".json",
table_partition_cols: list[tuple[str, str | pa.DataType]] | None = None,
Expand Down Expand Up @@ -1057,7 +1072,7 @@ def read_json(
def read_csv(
self,
path: str | pathlib.Path | list[str] | list[pathlib.Path],
schema: pa.Schema | None = None,
schema: ArrowSchemaExportable | None = None,
has_header: bool = True,
delimiter: str = ",",
schema_infer_max_records: int = 1000,
Expand Down Expand Up @@ -1111,7 +1126,7 @@ def read_parquet(
parquet_pruning: bool = True,
file_extension: str = ".parquet",
skip_metadata: bool = True,
schema: pa.Schema | None = None,
schema: ArrowSchemaExportable | None = None,
file_sort_order: Sequence[Sequence[SortKey]] | None = None,
) -> DataFrame:
"""Read a Parquet source into a :py:class:`~datafusion.dataframe.Dataframe`.
Expand Down Expand Up @@ -1155,7 +1170,7 @@ def read_parquet(
def read_avro(
self,
path: str | pathlib.Path,
schema: pa.Schema | None = None,
schema: ArrowSchemaExportable | None = None,
file_partition_cols: list[tuple[str, str | pa.DataType]] | None = None,
file_extension: str = ".avro",
) -> DataFrame:
Expand Down Expand Up @@ -1241,4 +1256,4 @@ def _convert_table_partition_cols(
stacklevel=2,
)

return converted_table_partition_cols
return converted_table_partition_cols
Loading