_ragged_array.py 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475
  1. """Provides a conversion to / from a ragged array representation of geometries.
  2. A ragged (or "jagged") array is an irregular array of arrays of which each
  3. element can have a different length. As a result, such an array cannot be
  4. represented as a standard, rectangular nD array.
  5. The coordinates of geometries can be represented as arrays of arrays of
  6. coordinate pairs (possibly multiple levels of nesting, depending on the
  7. geometry type).
  8. Geometries, as a ragged array of coordinates, can be efficiently represented
  9. as contiguous arrays of coordinates provided that there is another data
  10. structure that keeps track of which range of coordinate values corresponds
  11. to a given geometry. This can be done using offsets, counts, or indices.
  12. This module currently implements offsets into the coordinates array. This
  13. is the ragged array representation defined by the the Apache Arrow project
  14. as "variable size list array" (https://arrow.apache.org/docs/format/Columnar.html#variable-size-list-layout).
  15. See for example https://cfconventions.org/Data/cf-conventions/cf-conventions-1.9/cf-conventions.html#representations-features
  16. for different options.
  17. The exact usage of the Arrow list array with varying degrees of nesting for the
  18. different geometry types is defined by the GeoArrow project:
  19. https://github.com/geoarrow/geoarrow
  20. """
  21. import numpy as np
  22. from shapely import creation, geos_version
  23. from shapely._geometry import (
  24. GeometryType,
  25. get_parts,
  26. get_rings,
  27. get_type_id,
  28. )
  29. from shapely._geometry_helpers import (
  30. _from_ragged_array_multi_linear,
  31. _from_ragged_array_multipolygon,
  32. )
  33. from shapely.coordinates import get_coordinates
  34. from shapely.predicates import is_empty, is_missing
  35. __all__ = ["from_ragged_array", "to_ragged_array"]
  36. _geos_ge_312 = geos_version >= (3, 12, 0)
  37. # # GEOS -> coords/offset arrays (to_ragged_array)
  38. def _get_arrays_point(arr, include_z, include_m):
  39. # only one array of coordinates
  40. coords = get_coordinates(arr, include_z=include_z, include_m=include_m)
  41. # empty points are represented by NaNs
  42. # + missing geometries should also be present with some value
  43. empties = is_empty(arr) | is_missing(arr)
  44. if empties.any():
  45. indices = np.nonzero(empties)[0]
  46. indices = indices - np.arange(len(indices))
  47. coords = np.insert(coords, indices, np.nan, axis=0)
  48. return coords, ()
  49. def _indices_to_offsets(indices, n):
  50. # default to int32 offsets if possible (to prefer the non-large arrow list variants)
  51. # n_coords is the length of the array the indices are poin
  52. if len(indices) > 2147483647:
  53. dtype = np.int64
  54. else:
  55. dtype = np.int32
  56. offsets = np.insert(np.bincount(indices).cumsum(dtype=dtype), 0, 0)
  57. if len(offsets) != n + 1:
  58. # last geometries might be empty or missing
  59. offsets = np.pad(
  60. offsets,
  61. (0, n + 1 - len(offsets)),
  62. "constant",
  63. constant_values=offsets[-1],
  64. )
  65. return offsets
  66. def _get_arrays_multipoint(arr, include_z, include_m):
  67. # explode/flatten the MultiPoints
  68. _, part_indices = get_parts(arr, return_index=True)
  69. # the offsets into the multipoint parts
  70. offsets = _indices_to_offsets(part_indices, len(arr))
  71. # only one array of coordinates
  72. coords = get_coordinates(arr, include_z=include_z, include_m=include_m)
  73. return coords, (offsets,)
  74. def _get_arrays_linestring(arr, include_z, include_m):
  75. # the coords and offsets into the coordinates of the linestrings
  76. coords, indices = get_coordinates(
  77. arr, return_index=True, include_z=include_z, include_m=include_m
  78. )
  79. offsets = _indices_to_offsets(indices, len(arr))
  80. return coords, (offsets,)
  81. def _get_arrays_multilinestring(arr, include_z, include_m):
  82. # explode/flatten the MultiLineStrings
  83. arr_flat, part_indices = get_parts(arr, return_index=True)
  84. # the offsets into the multilinestring parts
  85. offsets2 = _indices_to_offsets(part_indices, len(arr))
  86. # the coords and offsets into the coordinates of the linestrings
  87. coords, indices = get_coordinates(
  88. arr_flat, return_index=True, include_z=include_z, include_m=include_m
  89. )
  90. offsets1 = _indices_to_offsets(indices, len(arr_flat))
  91. return coords, (offsets1, offsets2)
  92. def _get_arrays_polygon(arr, include_z, include_m):
  93. # explode/flatten the Polygons into Rings
  94. arr_flat, ring_indices = get_rings(arr, return_index=True)
  95. # the offsets into the exterior/interior rings of the multipolygon parts
  96. offsets2 = _indices_to_offsets(ring_indices, len(arr))
  97. # the coords and offsets into the coordinates of the rings
  98. coords, indices = get_coordinates(
  99. arr_flat, return_index=True, include_z=include_z, include_m=include_m
  100. )
  101. offsets1 = _indices_to_offsets(indices, len(arr_flat))
  102. return coords, (offsets1, offsets2)
  103. def _get_arrays_multipolygon(arr, include_z, include_m):
  104. # explode/flatten the MultiPolygons
  105. arr_flat, part_indices = get_parts(arr, return_index=True)
  106. # the offsets into the multipolygon parts
  107. offsets3 = _indices_to_offsets(part_indices, len(arr))
  108. # explode/flatten the Polygons into Rings
  109. arr_flat2, ring_indices = get_rings(arr_flat, return_index=True)
  110. # the offsets into the exterior/interior rings of the multipolygon parts
  111. offsets2 = _indices_to_offsets(ring_indices, len(arr_flat))
  112. # the coords and offsets into the coordinates of the rings
  113. coords, indices = get_coordinates(
  114. arr_flat2, return_index=True, include_z=include_z, include_m=include_m
  115. )
  116. offsets1 = _indices_to_offsets(indices, len(arr_flat2))
  117. return coords, (offsets1, offsets2, offsets3)
  118. def to_ragged_array(geometries, include_z=None, include_m=None):
  119. """Convert geometries to a ragged array representation.
  120. This function converts an array of geometries to a ragged array
  121. (i.e. irregular array of arrays) of coordinates, represented in memory
  122. using a single contiguous array of the coordinates, and
  123. up to 3 offset arrays that keep track where each sub-array
  124. starts and ends.
  125. This follows the in-memory layout of the variable size list arrays defined
  126. by Apache Arrow, as specified for geometries by the GeoArrow project:
  127. https://github.com/geoarrow/geoarrow.
  128. Parameters
  129. ----------
  130. geometries : array_like
  131. Array of geometries (1-dimensional).
  132. include_z, include_m : bool, default None
  133. If both are False, return XY (2D) geometries.
  134. If both are True, return XYZM (4D) geometries.
  135. If either is True, return either XYZ or XYM (3D) geometries.
  136. If a geometry has no Z or M dimension, extra coordinate data will be NaN.
  137. By default, will infer the dimensionality from the
  138. input geometries. Note that this inference can be unreliable with
  139. empty geometries (for a guaranteed result, it is recommended to
  140. specify the keyword).
  141. .. versionadded:: 2.1.0
  142. The ``include_m`` parameter was added to support XYM (3D) and
  143. XYZM (4D) geometries available with GEOS 3.12.0 or later.
  144. With older GEOS versions, M dimension coordinates will be NaN.
  145. Returns
  146. -------
  147. tuple of (geometry_type, coords, offsets)
  148. geometry_type : GeometryType
  149. The type of the input geometries (required information for
  150. roundtrip).
  151. coords : np.ndarray
  152. Contiguous array of shape (n, 2), (n, 3), or (n, 4) of all
  153. coordinates of all input geometries.
  154. offsets: tuple of np.ndarray
  155. Offset arrays that make it possible to reconstruct the
  156. geometries from the flat coordinates array. The number of
  157. offset arrays depends on the geometry type. See
  158. https://github.com/geoarrow/geoarrow/blob/main/format.md
  159. for details.
  160. Uses int32 dtype offsets if possible, otherwise int64 for
  161. large inputs (coordinates > 32GB).
  162. Notes
  163. -----
  164. Mixed singular and multi geometry types of the same basic type are
  165. allowed (e.g., Point and MultiPoint) and all singular types will be
  166. treated as multi types.
  167. GeometryCollections and other mixed geometry types are not supported.
  168. See Also
  169. --------
  170. from_ragged_array
  171. Examples
  172. --------
  173. Consider a Polygon with one hole (interior ring):
  174. >>> import shapely
  175. >>> from shapely import Polygon
  176. >>> polygon = Polygon(
  177. ... [(0, 0), (10, 0), (10, 10), (0, 10)],
  178. ... holes=[[(2, 2), (3, 2), (2, 3)]]
  179. ... )
  180. >>> polygon
  181. <POLYGON ((0 0, 10 0, 10 10, 0 10, 0 0), (2 2, 3 2, 2 3, 2 2))>
  182. This polygon can be thought of as a list of rings (first ring is the
  183. exterior ring, subsequent rings are the interior rings), and each ring
  184. as a list of coordinate pairs. This is very similar to how GeoJSON
  185. represents the coordinates:
  186. >>> import json
  187. >>> json.loads(shapely.to_geojson(polygon))["coordinates"]
  188. [[[0.0, 0.0], [10.0, 0.0], [10.0, 10.0], [0.0, 10.0], [0.0, 0.0]],
  189. [[2.0, 2.0], [3.0, 2.0], [2.0, 3.0], [2.0, 2.0]]]
  190. This function will return a similar list of lists of lists, but
  191. using a single contiguous array of coordinates, and multiple arrays of
  192. offsets:
  193. >>> geometry_type, coords, offsets = shapely.to_ragged_array([polygon])
  194. >>> geometry_type
  195. <GeometryType.POLYGON: 3>
  196. >>> coords
  197. array([[ 0., 0.],
  198. [10., 0.],
  199. [10., 10.],
  200. [ 0., 10.],
  201. [ 0., 0.],
  202. [ 2., 2.],
  203. [ 3., 2.],
  204. [ 2., 3.],
  205. [ 2., 2.]])
  206. >>> offsets
  207. (array([0, 5, 9], dtype=int32), array([0, 2], dtype=int32))
  208. As an example how to interpret the offsets: the i-th ring in the
  209. coordinates is represented by ``offsets[0][i]`` to ``offsets[0][i+1]``:
  210. >>> exterior_ring_start, exterior_ring_end = offsets[0][0], offsets[0][1]
  211. >>> coords[exterior_ring_start:exterior_ring_end]
  212. array([[ 0., 0.],
  213. [10., 0.],
  214. [10., 10.],
  215. [ 0., 10.],
  216. [ 0., 0.]])
  217. """
  218. from shapely import has_m, has_z # avoid circular import
  219. geometries = np.asarray(geometries)
  220. if include_z is None:
  221. include_z = np.any(has_z(geometries[~is_empty(geometries)]))
  222. if include_m is None:
  223. if _geos_ge_312:
  224. include_m = np.any(has_m(geometries[~is_empty(geometries)]))
  225. else:
  226. include_m = False
  227. geom_types = np.unique(get_type_id(geometries))
  228. # ignore missing values (type of -1)
  229. geom_types = geom_types[geom_types >= 0]
  230. get_arrays_args = geometries, include_z, include_m
  231. if len(geom_types) == 1:
  232. typ = GeometryType(geom_types[0])
  233. if typ == GeometryType.POINT:
  234. coords, offsets = _get_arrays_point(*get_arrays_args)
  235. elif typ == GeometryType.LINESTRING:
  236. coords, offsets = _get_arrays_linestring(*get_arrays_args)
  237. elif typ == GeometryType.POLYGON:
  238. coords, offsets = _get_arrays_polygon(*get_arrays_args)
  239. elif typ == GeometryType.MULTIPOINT:
  240. coords, offsets = _get_arrays_multipoint(*get_arrays_args)
  241. elif typ == GeometryType.MULTILINESTRING:
  242. coords, offsets = _get_arrays_multilinestring(*get_arrays_args)
  243. elif typ == GeometryType.MULTIPOLYGON:
  244. coords, offsets = _get_arrays_multipolygon(*get_arrays_args)
  245. else:
  246. raise ValueError(f"Geometry type {typ.name} is not supported")
  247. elif len(geom_types) == 2:
  248. if set(geom_types) == {GeometryType.POINT, GeometryType.MULTIPOINT}:
  249. typ = GeometryType.MULTIPOINT
  250. coords, offsets = _get_arrays_multipoint(*get_arrays_args)
  251. elif set(geom_types) == {GeometryType.LINESTRING, GeometryType.MULTILINESTRING}:
  252. typ = GeometryType.MULTILINESTRING
  253. coords, offsets = _get_arrays_multilinestring(*get_arrays_args)
  254. elif set(geom_types) == {GeometryType.POLYGON, GeometryType.MULTIPOLYGON}:
  255. typ = GeometryType.MULTIPOLYGON
  256. coords, offsets = _get_arrays_multipolygon(*get_arrays_args)
  257. else:
  258. raise ValueError(
  259. "Geometry type combination is not supported "
  260. f"({[GeometryType(t).name for t in geom_types]})"
  261. )
  262. else:
  263. raise ValueError(
  264. "Geometry type combination is not supported "
  265. f"({[GeometryType(t).name for t in geom_types]})"
  266. )
  267. return typ, coords, offsets
  268. # # coords/offset arrays -> GEOS (from_ragged_array)
  269. def _point_from_flatcoords(coords):
  270. result = creation.points(coords)
  271. # Older versions of GEOS (<= 3.9) don't automatically convert NaNs
  272. # to empty points -> do manually
  273. empties = np.isnan(coords).all(axis=1)
  274. if empties.any():
  275. result[empties] = creation.empty(1, geom_type=GeometryType.POINT).item()
  276. return result
  277. def _multipoint_from_flatcoords(coords, offsets):
  278. # recreate points
  279. if len(offsets):
  280. coords = coords[offsets[0] :]
  281. points = creation.points(coords)
  282. # recreate multipoints
  283. multipoint_parts = np.diff(offsets)
  284. multipoint_indices = np.repeat(np.arange(len(multipoint_parts)), multipoint_parts)
  285. result = np.empty(len(offsets) - 1, dtype=object)
  286. result = creation.multipoints(points, indices=multipoint_indices, out=result)
  287. result[multipoint_parts == 0] = creation.empty(
  288. 1, geom_type=GeometryType.MULTIPOINT
  289. ).item()
  290. return result
  291. def _linestring_from_flatcoords(coords, offsets):
  292. # recreate linestrings
  293. if len(offsets):
  294. coords = coords[offsets[0] :]
  295. linestring_n = np.diff(offsets)
  296. linestring_indices = np.repeat(np.arange(len(linestring_n)), linestring_n)
  297. result = np.empty(len(offsets) - 1, dtype=object)
  298. result = creation.linestrings(coords, indices=linestring_indices, out=result)
  299. result[linestring_n == 0] = creation.empty(
  300. 1, geom_type=GeometryType.LINESTRING
  301. ).item()
  302. return result
  303. def _multilinestrings_from_flatcoords(coords, offsets1, offsets2):
  304. # ensure correct dtypes
  305. offsets1 = np.asarray(offsets1, dtype="int64")
  306. offsets2 = np.asarray(offsets2, dtype="int64")
  307. # recreate multilinestrings
  308. result = _from_ragged_array_multi_linear(
  309. coords, offsets1, offsets2, geometry_type=GeometryType.MULTILINESTRING
  310. )
  311. return result
  312. def _polygon_from_flatcoords(coords, offsets1, offsets2):
  313. # ensure correct dtypes
  314. offsets1 = np.asarray(offsets1, dtype="int64")
  315. offsets2 = np.asarray(offsets2, dtype="int64")
  316. # recreate polygons
  317. result = _from_ragged_array_multi_linear(
  318. coords, offsets1, offsets2, geometry_type=GeometryType.POLYGON
  319. )
  320. return result
  321. def _multipolygons_from_flatcoords(coords, offsets1, offsets2, offsets3):
  322. # ensure correct dtypes
  323. offsets1 = np.asarray(offsets1, dtype="int64")
  324. offsets2 = np.asarray(offsets2, dtype="int64")
  325. offsets3 = np.asarray(offsets3, dtype="int64")
  326. # recreate multipolygons
  327. result = _from_ragged_array_multipolygon(coords, offsets1, offsets2, offsets3)
  328. return result
  329. def from_ragged_array(geometry_type, coords, offsets=None):
  330. """Create geometries from a contiguous array of coordinates and offset arrays.
  331. This function creates geometries from the ragged array representation
  332. as returned by ``to_ragged_array``.
  333. This follows the in-memory layout of the variable size list arrays defined
  334. by Apache Arrow, as specified for geometries by the GeoArrow project:
  335. https://github.com/geoarrow/geoarrow.
  336. See :func:`to_ragged_array` for more details.
  337. Parameters
  338. ----------
  339. geometry_type : GeometryType
  340. The type of geometry to create.
  341. coords : np.ndarray
  342. Contiguous array of shape (n, 2) or (n, 3) of all coordinates
  343. for the geometries.
  344. offsets: tuple of np.ndarray
  345. Offset arrays that allow to reconstruct the geometries based on the
  346. flat coordinates array. The number of offset arrays depends on the
  347. geometry type. See
  348. https://github.com/geoarrow/geoarrow/blob/main/format.md for details.
  349. Returns
  350. -------
  351. np.ndarray
  352. Array of geometries (1-dimensional).
  353. See Also
  354. --------
  355. to_ragged_array
  356. """
  357. coords = np.asarray(coords, dtype="float64")
  358. if geometry_type == GeometryType.POINT:
  359. if not (offsets is None or len(offsets) == 0):
  360. raise ValueError("'offsets' should not be provided for geometry type Point")
  361. return _point_from_flatcoords(coords)
  362. if offsets is None:
  363. raise ValueError(
  364. "'offsets' must be provided for any geometry type except for Point"
  365. )
  366. if geometry_type == GeometryType.LINESTRING:
  367. return _linestring_from_flatcoords(coords, *offsets)
  368. elif geometry_type == GeometryType.POLYGON:
  369. return _polygon_from_flatcoords(coords, *offsets)
  370. elif geometry_type == GeometryType.MULTIPOINT:
  371. return _multipoint_from_flatcoords(coords, *offsets)
  372. elif geometry_type == GeometryType.MULTILINESTRING:
  373. return _multilinestrings_from_flatcoords(coords, *offsets)
  374. elif geometry_type == GeometryType.MULTIPOLYGON:
  375. return _multipolygons_from_flatcoords(coords, *offsets)
  376. else:
  377. raise ValueError(f"Geometry type {geometry_type.name} is not supported")