| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475 |
- """Provides a conversion to / from a ragged array representation of geometries.
- A ragged (or "jagged") array is an irregular array of arrays of which each
- element can have a different length. As a result, such an array cannot be
- represented as a standard, rectangular nD array.
- The coordinates of geometries can be represented as arrays of arrays of
- coordinate pairs (possibly multiple levels of nesting, depending on the
- geometry type).
- Geometries, as a ragged array of coordinates, can be efficiently represented
- as contiguous arrays of coordinates provided that there is another data
- structure that keeps track of which range of coordinate values corresponds
- to a given geometry. This can be done using offsets, counts, or indices.
- This module currently implements offsets into the coordinates array. This
- is the ragged array representation defined by the the Apache Arrow project
- as "variable size list array" (https://arrow.apache.org/docs/format/Columnar.html#variable-size-list-layout).
- See for example https://cfconventions.org/Data/cf-conventions/cf-conventions-1.9/cf-conventions.html#representations-features
- for different options.
- The exact usage of the Arrow list array with varying degrees of nesting for the
- different geometry types is defined by the GeoArrow project:
- https://github.com/geoarrow/geoarrow
- """
- import numpy as np
- from shapely import creation, geos_version
- from shapely._geometry import (
- GeometryType,
- get_parts,
- get_rings,
- get_type_id,
- )
- from shapely._geometry_helpers import (
- _from_ragged_array_multi_linear,
- _from_ragged_array_multipolygon,
- )
- from shapely.coordinates import get_coordinates
- from shapely.predicates import is_empty, is_missing
- __all__ = ["from_ragged_array", "to_ragged_array"]
- _geos_ge_312 = geos_version >= (3, 12, 0)
- # # GEOS -> coords/offset arrays (to_ragged_array)
- def _get_arrays_point(arr, include_z, include_m):
- # only one array of coordinates
- coords = get_coordinates(arr, include_z=include_z, include_m=include_m)
- # empty points are represented by NaNs
- # + missing geometries should also be present with some value
- empties = is_empty(arr) | is_missing(arr)
- if empties.any():
- indices = np.nonzero(empties)[0]
- indices = indices - np.arange(len(indices))
- coords = np.insert(coords, indices, np.nan, axis=0)
- return coords, ()
- def _indices_to_offsets(indices, n):
- # default to int32 offsets if possible (to prefer the non-large arrow list variants)
- # n_coords is the length of the array the indices are poin
- if len(indices) > 2147483647:
- dtype = np.int64
- else:
- dtype = np.int32
- offsets = np.insert(np.bincount(indices).cumsum(dtype=dtype), 0, 0)
- if len(offsets) != n + 1:
- # last geometries might be empty or missing
- offsets = np.pad(
- offsets,
- (0, n + 1 - len(offsets)),
- "constant",
- constant_values=offsets[-1],
- )
- return offsets
- def _get_arrays_multipoint(arr, include_z, include_m):
- # explode/flatten the MultiPoints
- _, part_indices = get_parts(arr, return_index=True)
- # the offsets into the multipoint parts
- offsets = _indices_to_offsets(part_indices, len(arr))
- # only one array of coordinates
- coords = get_coordinates(arr, include_z=include_z, include_m=include_m)
- return coords, (offsets,)
- def _get_arrays_linestring(arr, include_z, include_m):
- # the coords and offsets into the coordinates of the linestrings
- coords, indices = get_coordinates(
- arr, return_index=True, include_z=include_z, include_m=include_m
- )
- offsets = _indices_to_offsets(indices, len(arr))
- return coords, (offsets,)
- def _get_arrays_multilinestring(arr, include_z, include_m):
- # explode/flatten the MultiLineStrings
- arr_flat, part_indices = get_parts(arr, return_index=True)
- # the offsets into the multilinestring parts
- offsets2 = _indices_to_offsets(part_indices, len(arr))
- # the coords and offsets into the coordinates of the linestrings
- coords, indices = get_coordinates(
- arr_flat, return_index=True, include_z=include_z, include_m=include_m
- )
- offsets1 = _indices_to_offsets(indices, len(arr_flat))
- return coords, (offsets1, offsets2)
- def _get_arrays_polygon(arr, include_z, include_m):
- # explode/flatten the Polygons into Rings
- arr_flat, ring_indices = get_rings(arr, return_index=True)
- # the offsets into the exterior/interior rings of the multipolygon parts
- offsets2 = _indices_to_offsets(ring_indices, len(arr))
- # the coords and offsets into the coordinates of the rings
- coords, indices = get_coordinates(
- arr_flat, return_index=True, include_z=include_z, include_m=include_m
- )
- offsets1 = _indices_to_offsets(indices, len(arr_flat))
- return coords, (offsets1, offsets2)
- def _get_arrays_multipolygon(arr, include_z, include_m):
- # explode/flatten the MultiPolygons
- arr_flat, part_indices = get_parts(arr, return_index=True)
- # the offsets into the multipolygon parts
- offsets3 = _indices_to_offsets(part_indices, len(arr))
- # explode/flatten the Polygons into Rings
- arr_flat2, ring_indices = get_rings(arr_flat, return_index=True)
- # the offsets into the exterior/interior rings of the multipolygon parts
- offsets2 = _indices_to_offsets(ring_indices, len(arr_flat))
- # the coords and offsets into the coordinates of the rings
- coords, indices = get_coordinates(
- arr_flat2, return_index=True, include_z=include_z, include_m=include_m
- )
- offsets1 = _indices_to_offsets(indices, len(arr_flat2))
- return coords, (offsets1, offsets2, offsets3)
- def to_ragged_array(geometries, include_z=None, include_m=None):
- """Convert geometries to a ragged array representation.
- This function converts an array of geometries to a ragged array
- (i.e. irregular array of arrays) of coordinates, represented in memory
- using a single contiguous array of the coordinates, and
- up to 3 offset arrays that keep track where each sub-array
- starts and ends.
- This follows the in-memory layout of the variable size list arrays defined
- by Apache Arrow, as specified for geometries by the GeoArrow project:
- https://github.com/geoarrow/geoarrow.
- Parameters
- ----------
- geometries : array_like
- Array of geometries (1-dimensional).
- include_z, include_m : bool, default None
- If both are False, return XY (2D) geometries.
- If both are True, return XYZM (4D) geometries.
- If either is True, return either XYZ or XYM (3D) geometries.
- If a geometry has no Z or M dimension, extra coordinate data will be NaN.
- By default, will infer the dimensionality from the
- input geometries. Note that this inference can be unreliable with
- empty geometries (for a guaranteed result, it is recommended to
- specify the keyword).
- .. versionadded:: 2.1.0
- The ``include_m`` parameter was added to support XYM (3D) and
- XYZM (4D) geometries available with GEOS 3.12.0 or later.
- With older GEOS versions, M dimension coordinates will be NaN.
- Returns
- -------
- tuple of (geometry_type, coords, offsets)
- geometry_type : GeometryType
- The type of the input geometries (required information for
- roundtrip).
- coords : np.ndarray
- Contiguous array of shape (n, 2), (n, 3), or (n, 4) of all
- coordinates of all input geometries.
- offsets: tuple of np.ndarray
- Offset arrays that make it possible to reconstruct the
- geometries from the flat coordinates array. The number of
- offset arrays depends on the geometry type. See
- https://github.com/geoarrow/geoarrow/blob/main/format.md
- for details.
- Uses int32 dtype offsets if possible, otherwise int64 for
- large inputs (coordinates > 32GB).
- Notes
- -----
- Mixed singular and multi geometry types of the same basic type are
- allowed (e.g., Point and MultiPoint) and all singular types will be
- treated as multi types.
- GeometryCollections and other mixed geometry types are not supported.
- See Also
- --------
- from_ragged_array
- Examples
- --------
- Consider a Polygon with one hole (interior ring):
- >>> import shapely
- >>> from shapely import Polygon
- >>> polygon = Polygon(
- ... [(0, 0), (10, 0), (10, 10), (0, 10)],
- ... holes=[[(2, 2), (3, 2), (2, 3)]]
- ... )
- >>> polygon
- <POLYGON ((0 0, 10 0, 10 10, 0 10, 0 0), (2 2, 3 2, 2 3, 2 2))>
- This polygon can be thought of as a list of rings (first ring is the
- exterior ring, subsequent rings are the interior rings), and each ring
- as a list of coordinate pairs. This is very similar to how GeoJSON
- represents the coordinates:
- >>> import json
- >>> json.loads(shapely.to_geojson(polygon))["coordinates"]
- [[[0.0, 0.0], [10.0, 0.0], [10.0, 10.0], [0.0, 10.0], [0.0, 0.0]],
- [[2.0, 2.0], [3.0, 2.0], [2.0, 3.0], [2.0, 2.0]]]
- This function will return a similar list of lists of lists, but
- using a single contiguous array of coordinates, and multiple arrays of
- offsets:
- >>> geometry_type, coords, offsets = shapely.to_ragged_array([polygon])
- >>> geometry_type
- <GeometryType.POLYGON: 3>
- >>> coords
- array([[ 0., 0.],
- [10., 0.],
- [10., 10.],
- [ 0., 10.],
- [ 0., 0.],
- [ 2., 2.],
- [ 3., 2.],
- [ 2., 3.],
- [ 2., 2.]])
- >>> offsets
- (array([0, 5, 9], dtype=int32), array([0, 2], dtype=int32))
- As an example how to interpret the offsets: the i-th ring in the
- coordinates is represented by ``offsets[0][i]`` to ``offsets[0][i+1]``:
- >>> exterior_ring_start, exterior_ring_end = offsets[0][0], offsets[0][1]
- >>> coords[exterior_ring_start:exterior_ring_end]
- array([[ 0., 0.],
- [10., 0.],
- [10., 10.],
- [ 0., 10.],
- [ 0., 0.]])
- """
- from shapely import has_m, has_z # avoid circular import
- geometries = np.asarray(geometries)
- if include_z is None:
- include_z = np.any(has_z(geometries[~is_empty(geometries)]))
- if include_m is None:
- if _geos_ge_312:
- include_m = np.any(has_m(geometries[~is_empty(geometries)]))
- else:
- include_m = False
- geom_types = np.unique(get_type_id(geometries))
- # ignore missing values (type of -1)
- geom_types = geom_types[geom_types >= 0]
- get_arrays_args = geometries, include_z, include_m
- if len(geom_types) == 1:
- typ = GeometryType(geom_types[0])
- if typ == GeometryType.POINT:
- coords, offsets = _get_arrays_point(*get_arrays_args)
- elif typ == GeometryType.LINESTRING:
- coords, offsets = _get_arrays_linestring(*get_arrays_args)
- elif typ == GeometryType.POLYGON:
- coords, offsets = _get_arrays_polygon(*get_arrays_args)
- elif typ == GeometryType.MULTIPOINT:
- coords, offsets = _get_arrays_multipoint(*get_arrays_args)
- elif typ == GeometryType.MULTILINESTRING:
- coords, offsets = _get_arrays_multilinestring(*get_arrays_args)
- elif typ == GeometryType.MULTIPOLYGON:
- coords, offsets = _get_arrays_multipolygon(*get_arrays_args)
- else:
- raise ValueError(f"Geometry type {typ.name} is not supported")
- elif len(geom_types) == 2:
- if set(geom_types) == {GeometryType.POINT, GeometryType.MULTIPOINT}:
- typ = GeometryType.MULTIPOINT
- coords, offsets = _get_arrays_multipoint(*get_arrays_args)
- elif set(geom_types) == {GeometryType.LINESTRING, GeometryType.MULTILINESTRING}:
- typ = GeometryType.MULTILINESTRING
- coords, offsets = _get_arrays_multilinestring(*get_arrays_args)
- elif set(geom_types) == {GeometryType.POLYGON, GeometryType.MULTIPOLYGON}:
- typ = GeometryType.MULTIPOLYGON
- coords, offsets = _get_arrays_multipolygon(*get_arrays_args)
- else:
- raise ValueError(
- "Geometry type combination is not supported "
- f"({[GeometryType(t).name for t in geom_types]})"
- )
- else:
- raise ValueError(
- "Geometry type combination is not supported "
- f"({[GeometryType(t).name for t in geom_types]})"
- )
- return typ, coords, offsets
- # # coords/offset arrays -> GEOS (from_ragged_array)
- def _point_from_flatcoords(coords):
- result = creation.points(coords)
- # Older versions of GEOS (<= 3.9) don't automatically convert NaNs
- # to empty points -> do manually
- empties = np.isnan(coords).all(axis=1)
- if empties.any():
- result[empties] = creation.empty(1, geom_type=GeometryType.POINT).item()
- return result
- def _multipoint_from_flatcoords(coords, offsets):
- # recreate points
- if len(offsets):
- coords = coords[offsets[0] :]
- points = creation.points(coords)
- # recreate multipoints
- multipoint_parts = np.diff(offsets)
- multipoint_indices = np.repeat(np.arange(len(multipoint_parts)), multipoint_parts)
- result = np.empty(len(offsets) - 1, dtype=object)
- result = creation.multipoints(points, indices=multipoint_indices, out=result)
- result[multipoint_parts == 0] = creation.empty(
- 1, geom_type=GeometryType.MULTIPOINT
- ).item()
- return result
- def _linestring_from_flatcoords(coords, offsets):
- # recreate linestrings
- if len(offsets):
- coords = coords[offsets[0] :]
- linestring_n = np.diff(offsets)
- linestring_indices = np.repeat(np.arange(len(linestring_n)), linestring_n)
- result = np.empty(len(offsets) - 1, dtype=object)
- result = creation.linestrings(coords, indices=linestring_indices, out=result)
- result[linestring_n == 0] = creation.empty(
- 1, geom_type=GeometryType.LINESTRING
- ).item()
- return result
- def _multilinestrings_from_flatcoords(coords, offsets1, offsets2):
- # ensure correct dtypes
- offsets1 = np.asarray(offsets1, dtype="int64")
- offsets2 = np.asarray(offsets2, dtype="int64")
- # recreate multilinestrings
- result = _from_ragged_array_multi_linear(
- coords, offsets1, offsets2, geometry_type=GeometryType.MULTILINESTRING
- )
- return result
- def _polygon_from_flatcoords(coords, offsets1, offsets2):
- # ensure correct dtypes
- offsets1 = np.asarray(offsets1, dtype="int64")
- offsets2 = np.asarray(offsets2, dtype="int64")
- # recreate polygons
- result = _from_ragged_array_multi_linear(
- coords, offsets1, offsets2, geometry_type=GeometryType.POLYGON
- )
- return result
- def _multipolygons_from_flatcoords(coords, offsets1, offsets2, offsets3):
- # ensure correct dtypes
- offsets1 = np.asarray(offsets1, dtype="int64")
- offsets2 = np.asarray(offsets2, dtype="int64")
- offsets3 = np.asarray(offsets3, dtype="int64")
- # recreate multipolygons
- result = _from_ragged_array_multipolygon(coords, offsets1, offsets2, offsets3)
- return result
- def from_ragged_array(geometry_type, coords, offsets=None):
- """Create geometries from a contiguous array of coordinates and offset arrays.
- This function creates geometries from the ragged array representation
- as returned by ``to_ragged_array``.
- This follows the in-memory layout of the variable size list arrays defined
- by Apache Arrow, as specified for geometries by the GeoArrow project:
- https://github.com/geoarrow/geoarrow.
- See :func:`to_ragged_array` for more details.
- Parameters
- ----------
- geometry_type : GeometryType
- The type of geometry to create.
- coords : np.ndarray
- Contiguous array of shape (n, 2) or (n, 3) of all coordinates
- for the geometries.
- offsets: tuple of np.ndarray
- Offset arrays that allow to reconstruct the geometries based on the
- flat coordinates array. The number of offset arrays depends on the
- geometry type. See
- https://github.com/geoarrow/geoarrow/blob/main/format.md for details.
- Returns
- -------
- np.ndarray
- Array of geometries (1-dimensional).
- See Also
- --------
- to_ragged_array
- """
- coords = np.asarray(coords, dtype="float64")
- if geometry_type == GeometryType.POINT:
- if not (offsets is None or len(offsets) == 0):
- raise ValueError("'offsets' should not be provided for geometry type Point")
- return _point_from_flatcoords(coords)
- if offsets is None:
- raise ValueError(
- "'offsets' must be provided for any geometry type except for Point"
- )
- if geometry_type == GeometryType.LINESTRING:
- return _linestring_from_flatcoords(coords, *offsets)
- elif geometry_type == GeometryType.POLYGON:
- return _polygon_from_flatcoords(coords, *offsets)
- elif geometry_type == GeometryType.MULTIPOINT:
- return _multipoint_from_flatcoords(coords, *offsets)
- elif geometry_type == GeometryType.MULTILINESTRING:
- return _multilinestrings_from_flatcoords(coords, *offsets)
- elif geometry_type == GeometryType.MULTIPOLYGON:
- return _multipolygons_from_flatcoords(coords, *offsets)
- else:
- raise ValueError(f"Geometry type {geometry_type.name} is not supported")
|