| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192 |
- # Short term workaround for https://github.com/ray-project/ray/issues/32435
- # Dataset has a hard dependency on pandas, so it doesn't need to be delayed.
- import pandas # noqa
- from packaging.version import parse as parse_version
- from ray.data._internal.utils.arrow_utils import get_pyarrow_version
- from ray.data._internal.compute import ActorPoolStrategy, TaskPoolStrategy
- from ray.data._internal.datasource.tfrecords_datasource import TFXReadOptions
- from ray.data._internal.execution.interfaces import (
- ExecutionOptions,
- ExecutionResources,
- NodeIdStr,
- )
- from ray.data._internal.logging import configure_logging
- from ray.data.context import DataContext, DatasetContext
- from ray.data.dataset import (
- Dataset,
- Schema,
- SinkMode,
- ClickHouseTableSettings,
- SaveMode,
- )
- from ray.data.stats import DatasetSummary
- from ray.data.datasource import (
- BlockBasedFileDatasink,
- Datasink,
- Datasource,
- FileShuffleConfig,
- ReadTask,
- RowBasedFileDatasink,
- )
- from ray.data.iterator import DataIterator, DatasetIterator
- from ray.data.preprocessor import Preprocessor
- from ray.data.read_api import ( # noqa: F401
- KafkaAuthConfig, # noqa: F401
- from_arrow,
- from_arrow_refs,
- from_blocks,
- from_daft,
- from_dask,
- from_huggingface,
- from_items,
- from_mars,
- from_modin,
- from_numpy,
- from_numpy_refs,
- from_pandas,
- from_pandas_refs,
- from_spark,
- from_tf,
- from_torch,
- range,
- range_tensor,
- read_audio,
- read_avro,
- read_bigquery,
- read_binary_files,
- read_clickhouse,
- read_csv,
- read_databricks_tables,
- read_datasource,
- read_delta,
- read_delta_sharing_tables,
- read_kafka,
- read_hudi,
- read_iceberg,
- read_images,
- read_json,
- read_lance,
- read_mcap,
- read_mongo,
- read_numpy,
- read_parquet,
- read_snowflake,
- read_sql,
- read_text,
- read_tfrecords,
- read_unity_catalog,
- read_videos,
- read_webdataset,
- )
# Module-level cache slot for the global functions used by callable classes.
# It is defined here because it has to be process-global across cloudpickled
# functions.
_map_actor_context = None

configure_logging()

try:
    import pyarrow as pa

    # Importing the Arrow tensor extension types is what registers them.
    from ray.data._internal.tensor_extensions.arrow import (  # noqa
        ArrowTensorType,
        ArrowVariableShapedTensorType,
    )

    # `PyExtensionType` was deprecated in
    # https://github.com/apache/arrow/pull/38608, which also disabled its
    # deserialization by default. So that data written with earlier versions
    # of Ray Data can still be loaded, auto-loading of serialized tensor
    # extensions can be switched back on through the
    # `RAY_DATA_AUTOLOAD_PYEXTENSIONTYPE` flag (read via `env_bool`, default
    # `False`).
    #
    # NOTE: `PyExtensionType` is deleted from Arrow >= 21.0, so the whole
    #       setup is skipped there (and when the version is unknown).
    pyarrow_version = get_pyarrow_version()
    if pyarrow_version is not None and pyarrow_version < parse_version("21.0.0"):
        from ray._private.ray_constants import env_bool

        RAY_DATA_AUTOLOAD_PYEXTENSIONTYPE = env_bool(
            "RAY_DATA_AUTOLOAD_PYEXTENSIONTYPE", False
        )

        # Auto-load is only touched for Arrow >= 14.0.1, and only when the
        # flag above is set.
        if pyarrow_version >= parse_version("14.0.1"):
            if RAY_DATA_AUTOLOAD_PYEXTENSIONTYPE:
                pa.PyExtensionType.set_auto_load(True)
except ModuleNotFoundError:
    # One of the imports above is unavailable; skip the Arrow-specific setup.
    pass
# Names exported by `from <this package> import *`. Every entry must be bound
# at module level by the imports at the top of this file.
#
# NOTE(review): `from_blocks` is imported at the top of the file as well but
# is not listed here -- confirm whether that omission is intentional.
__all__ = [
    "ActorPoolStrategy",
    "BlockBasedFileDatasink",
    "ClickHouseTableSettings",
    "Dataset",
    "DataContext",
    "DatasetContext",  # Backwards compatibility alias.
    "DatasetSummary",
    "DataIterator",
    "DatasetIterator",  # Backwards compatibility alias.
    "Datasink",
    "Datasource",
    "ExecutionOptions",
    "ExecutionResources",
    "FileShuffleConfig",
    "NodeIdStr",
    "ReadTask",
    "RowBasedFileDatasink",
    "Schema",
    "SinkMode",
    "SaveMode",
    "TaskPoolStrategy",
    "from_daft",
    "from_dask",
    "from_items",
    "from_arrow",
    "from_arrow_refs",
    "from_mars",
    "from_modin",
    "from_numpy",
    "from_numpy_refs",
    "from_pandas",
    "from_pandas_refs",
    "from_spark",
    "from_tf",
    "from_torch",
    "from_huggingface",
    "range",
    "range_tensor",
    "read_audio",
    "read_avro",
    "read_text",
    # Previously imported for re-export but missing from this list.
    "read_bigquery",
    "read_binary_files",
    "read_clickhouse",
    "read_csv",
    # Previously imported for re-export but missing from this list.
    "read_databricks_tables",
    "read_datasource",
    "read_delta",
    "read_delta_sharing_tables",
    "read_kafka",
    "KafkaAuthConfig",
    "read_hudi",
    "read_iceberg",
    "read_images",
    "read_json",
    "read_lance",
    "read_mcap",
    "read_numpy",
    "read_mongo",
    "read_parquet",
    "read_snowflake",
    "read_sql",
    "read_tfrecords",
    "read_unity_catalog",
    "read_videos",
    "read_webdataset",
    "Preprocessor",
    "TFXReadOptions",
]
|