__init__.py 4.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192
  1. # Short term workaround for https://github.com/ray-project/ray/issues/32435
  2. # Dataset has a hard dependency on pandas, so it doesn't need to be delayed.
  3. import pandas # noqa
  4. from packaging.version import parse as parse_version
  5. from ray.data._internal.utils.arrow_utils import get_pyarrow_version
  6. from ray.data._internal.compute import ActorPoolStrategy, TaskPoolStrategy
  7. from ray.data._internal.datasource.tfrecords_datasource import TFXReadOptions
  8. from ray.data._internal.execution.interfaces import (
  9. ExecutionOptions,
  10. ExecutionResources,
  11. NodeIdStr,
  12. )
  13. from ray.data._internal.logging import configure_logging
  14. from ray.data.context import DataContext, DatasetContext
  15. from ray.data.dataset import (
  16. Dataset,
  17. Schema,
  18. SinkMode,
  19. ClickHouseTableSettings,
  20. SaveMode,
  21. )
  22. from ray.data.stats import DatasetSummary
  23. from ray.data.datasource import (
  24. BlockBasedFileDatasink,
  25. Datasink,
  26. Datasource,
  27. FileShuffleConfig,
  28. ReadTask,
  29. RowBasedFileDatasink,
  30. )
  31. from ray.data.iterator import DataIterator, DatasetIterator
  32. from ray.data.preprocessor import Preprocessor
  33. from ray.data.read_api import ( # noqa: F401
  34. KafkaAuthConfig, # noqa: F401
  35. from_arrow,
  36. from_arrow_refs,
  37. from_blocks,
  38. from_daft,
  39. from_dask,
  40. from_huggingface,
  41. from_items,
  42. from_mars,
  43. from_modin,
  44. from_numpy,
  45. from_numpy_refs,
  46. from_pandas,
  47. from_pandas_refs,
  48. from_spark,
  49. from_tf,
  50. from_torch,
  51. range,
  52. range_tensor,
  53. read_audio,
  54. read_avro,
  55. read_bigquery,
  56. read_binary_files,
  57. read_clickhouse,
  58. read_csv,
  59. read_databricks_tables,
  60. read_datasource,
  61. read_delta,
  62. read_delta_sharing_tables,
  63. read_kafka,
  64. read_hudi,
  65. read_iceberg,
  66. read_images,
  67. read_json,
  68. read_lance,
  69. read_mcap,
  70. read_mongo,
  71. read_numpy,
  72. read_parquet,
  73. read_snowflake,
  74. read_sql,
  75. read_text,
  76. read_tfrecords,
  77. read_unity_catalog,
  78. read_videos,
  79. read_webdataset,
  80. )
  81. # Module-level cached global functions for callable classes. It needs to be defined here
  82. # since it has to be process-global across cloudpickled funcs.
  83. _map_actor_context = None
  84. configure_logging()
  85. try:
  86. import pyarrow as pa
  87. # Import these arrow extension types to ensure that they are registered.
  88. from ray.data._internal.tensor_extensions.arrow import ( # noqa
  89. ArrowTensorType,
  90. ArrowVariableShapedTensorType,
  91. )
  92. # https://github.com/apache/arrow/pull/38608 deprecated `PyExtensionType`, and
  93. # disabled it's deserialization by default. To ensure that users can load data
  94. # written with earlier version of Ray Data, we enable auto-loading of serialized
  95. # tensor extensions.
  96. #
  97. # NOTE: `PyExtensionType` is deleted from Arrow >= 21.0
  98. pyarrow_version = get_pyarrow_version()
  99. if pyarrow_version is None or pyarrow_version >= parse_version("21.0.0"):
  100. pass
  101. else:
  102. from ray._private.ray_constants import env_bool
  103. RAY_DATA_AUTOLOAD_PYEXTENSIONTYPE = env_bool(
  104. "RAY_DATA_AUTOLOAD_PYEXTENSIONTYPE", False
  105. )
  106. if (
  107. pyarrow_version >= parse_version("14.0.1")
  108. and RAY_DATA_AUTOLOAD_PYEXTENSIONTYPE
  109. ):
  110. pa.PyExtensionType.set_auto_load(True)
  111. except ModuleNotFoundError:
  112. pass
  113. __all__ = [
  114. "ActorPoolStrategy",
  115. "BlockBasedFileDatasink",
  116. "ClickHouseTableSettings",
  117. "Dataset",
  118. "DataContext",
  119. "DatasetContext", # Backwards compatibility alias.
  120. "DatasetSummary",
  121. "DataIterator",
  122. "DatasetIterator", # Backwards compatibility alias.
  123. "Datasink",
  124. "Datasource",
  125. "ExecutionOptions",
  126. "ExecutionResources",
  127. "FileShuffleConfig",
  128. "NodeIdStr",
  129. "ReadTask",
  130. "RowBasedFileDatasink",
  131. "Schema",
  132. "SinkMode",
  133. "SaveMode",
  134. "TaskPoolStrategy",
  135. "from_daft",
  136. "from_dask",
  137. "from_items",
  138. "from_arrow",
  139. "from_arrow_refs",
  140. "from_mars",
  141. "from_modin",
  142. "from_numpy",
  143. "from_numpy_refs",
  144. "from_pandas",
  145. "from_pandas_refs",
  146. "from_spark",
  147. "from_tf",
  148. "from_torch",
  149. "from_huggingface",
  150. "range",
  151. "range_tensor",
  152. "read_audio",
  153. "read_avro",
  154. "read_text",
  155. "read_binary_files",
  156. "read_clickhouse",
  157. "read_csv",
  158. "read_datasource",
  159. "read_delta",
  160. "read_delta_sharing_tables",
  161. "read_kafka",
  162. "KafkaAuthConfig",
  163. "read_hudi",
  164. "read_iceberg",
  165. "read_images",
  166. "read_json",
  167. "read_lance",
  168. "read_mcap",
  169. "read_numpy",
  170. "read_mongo",
  171. "read_parquet",
  172. "read_snowflake",
  173. "read_sql",
  174. "read_tfrecords",
  175. "read_unity_catalog",
  176. "read_videos",
  177. "read_webdataset",
  178. "Preprocessor",
  179. "TFXReadOptions",
  180. ]