8000 feat: add `bigframes.bigquery.st_distance` function by tswast · Pull Request #1637 · googleapis/python-bigquery-dataframes · GitHub
[go: up one dir, main page]

Skip to content
8000

feat: add bigframes.bigquery.st_distance function #1637

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 13 commits into from
Apr 22, 2025
Merged
8 changes: 7 additions & 1 deletion bigframes/bigquery/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,12 @@
unix_millis,
unix_seconds,
)
from bigframes.bigquery._operations.geo import st_area, st_difference, st_intersection
from bigframes.bigquery._operations.geo import (
st_area,
st_difference,
st_distance,
st_intersection,
)
from bigframes.bigquery._operations.json import (
json_extract,
json_extract_array,
Expand All @@ -49,6 +54,7 @@
# geo ops
"st_area",
"st_difference",
"st_distance",
"st_intersection",
# json ops
"json_set",
Expand Down
161 changes: 116 additions & 45 deletions bigframes/bigquery/_operations/geo.py
A3DB
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,11 @@

from __future__ import annotations

from typing import Union

import shapely # type: ignore

from bigframes import operations as ops
import bigframes.dtypes
import bigframes.geopandas
import bigframes.series

Expand All @@ -25,7 +28,9 @@
"""


def st_area(series: bigframes.series.Series) -> bigframes.series.Series:
def st_area(
series: Union[bigframes.series.Series, bigframes.geopandas.GeoSeries],
) -> bigframes.series.Series:
"""
Returns the area in square meters covered by the polygons in the input
`GEOGRAPHY`.
Expand Down Expand Up @@ -85,6 +90,10 @@ def st_area(series: bigframes.series.Series) -> bigframes.series.Series:
4 0.0
dtype: Float64

Args:
series (bigframes.pandas.Series | bigframes.geopandas.GeoSeries):
A series containing geography objects.

Returns:
bigframes.pandas.Series:
Series of float representing the areas.
Expand All @@ -95,7 +104,10 @@ def st_area(series: bigframes.series.Series) -> bigframes.series.Series:


def st_difference(
series: bigframes.series.Series, other: bigframes.series.Series
series: Union[bigframes.series.Series, bigframes.geopandas.GeoSeries],
other: Union[
bigframes.series.Series, bigframes.geopandas.GeoSeries, shapely.Geometry
],
) -> bigframes.series.Series:
"""
Returns a `GEOGRAPHY` that represents the point set difference of
Expand Down Expand Up @@ -166,44 +178,23 @@ def st_difference(
5 None
dtype: geometry

We can also check difference of single shapely geometries:

>>> polygon_s1 = bigframes.geopandas.GeoSeries(
... [
... Polygon([(0, 0), (10, 0), (10, 10), (0, 0)])
... ]
... )
>>> polygon_s2 = bigframes.geopandas.GeoSeries(
... [
... Polygon([(4, 2), (6, 2), (8, 6), (4, 2)])
... ]
... )

>>> polygon_s1
0 POLYGON ((0 0, 10 0, 10 10, 0 0))
dtype: geometry

>>> polygon_s2
0 POLYGON ((4 2, 6 2, 8 6, 4 2))
dtype: geometry

>>> bbq.st_difference(polygon_s1, polygon_s2)
0 POLYGON ((0 0, 10 0, 10 10, 0 0), (8 6, 6 2, 4...
dtype: geometry

Additionally, we can check difference of a GeoSeries against a single shapely geometry:

>>> bbq.st_difference(s1, polygon_s2)
0 POLYGON ((0 0, 2 2, 0 2, 0 0))
1 None
2 None
3 None
4 None
>>> polygon = Polygon([(0, 0), (10, 0), (10, 10), (0, 0)])
>>> bbq.st_difference(s1, polygon)
0 POLYGON ((1.97082 2.00002, 0 2, 0 0, 1.97082 2...
1 POLYGON ((1.97082 2.00002, 0 2, 0 0, 1.97082 2...
2 GEOMETRYCOLLECTION EMPTY
3 LINESTRING (0.99265 1.00781, 0 2)
4 POINT (0 1)
dtype: geometry

Args:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Miss series argument here.

other (bigframes.series.Series or geometric object):
The GeoSeries (elementwise) or geometric object to find the difference to.
series (bigframes.pandas.Series | bigframes.geopandas.GeoSeries):
A series containing geography objects.
other (bigframes.pandas.Series | bigframes.geopandas.GeoSeries | shapely.Geometry):
The series or geometric object to subtract from the geography
objects in ``series``.

Returns:
bigframes.series.Series:
Expand All @@ -213,8 +204,86 @@ def st_difference(
return series._apply_binary_op(other, ops.geo_st_difference_op)


def st_distance(
    series: Union[bigframes.series.Series, bigframes.geopandas.GeoSeries],
    other: Union[
        bigframes.series.Series, bigframes.geopandas.GeoSeries, shapely.Geometry
    ],
    *,
    use_spheroid: bool = False,
) -> bigframes.series.Series:
    """
    Computes the shortest distance in meters between two non-empty
    ``GEOGRAPHY`` objects.

    **Examples:**

        >>> import bigframes as bpd
        >>> import bigframes.bigquery as bbq
        >>> import bigframes.geopandas
        >>> from shapely.geometry import Polygon, LineString, Point
        >>> bpd.options.display.progress_bar = None

    Two GeoSeries can be compared against each other, row by row.

        >>> s1 = bigframes.geopandas.GeoSeries(
        ...     [
        ...         Point(0, 0),
        ...         Point(0.00001, 0),
        ...         Point(0.00002, 0),
        ...     ],
        ... )
        >>> s2 = bigframes.geopandas.GeoSeries(
        ...     [
        ...         Point(0.00001, 0),
        ...         Point(0.00003, 0),
        ...         Point(0.00005, 0),
        ...     ],
        ... )

        >>> bbq.st_distance(s1, s2, use_spheroid=True)
        0    1.113195
        1     2.22639
        2    3.339585
        dtype: Float64

    The distance from each geometry to a single shapely geometry can also
    be calculated:

        >>> bbq.st_distance(s2, Point(0.00001, 0))
        0         0.0
        1    2.223902
        2    4.447804
        dtype: Float64

    Args:
        series (bigframes.pandas.Series | bigframes.geopandas.GeoSeries):
            A series containing geography objects.
        other (bigframes.pandas.Series | bigframes.geopandas.GeoSeries | shapely.Geometry):
            The series or geometric object to calculate the distance in meters
            to from the geography objects in ``series``.
        use_spheroid (bool, optional, default ``False``):
            Selects how distance is measured. When ``use_spheroid`` is
            False, distance is measured on the surface of a perfect sphere.
            When ``use_spheroid`` is True, distance is measured on the
            surface of the `WGS84 spheroid
            <https://cloud.google.com/bigquery/docs/geospatial-data>`_.
            Keyword-only.

    Returns:
        bigframes.pandas.Series:
            The Series (elementwise) of the smallest distance between
            each aligned geometry with other.
    """
    # The spheroid flag travels with the op instance itself.
    distance_op = ops.GeoStDistanceOp(use_spheroid=use_spheroid)
    return series._apply_binary_op(other, distance_op)


def st_intersection(
series: bigframes.series.Series, other: bigframes.series.Series
series: Union[bigframes.series.Series, bigframes.geopandas.GeoSeries],
other: Union[
bigframes.series.Series, bigframes.geopandas.GeoSeries, shapely.Geometry
],
) -> bigframes.series.Series:
"""
Returns a `GEOGRAPHY` that represents the point set intersection of the two
Expand Down Expand Up @@ -284,18 +353,20 @@ def st_intersection(

We can also do intersection of each geometry and a single shapely geometry:

>>> bbq.st_intersection(s1, bigframes.geopandas.GeoSeries([Polygon([(0, 0), (1, 1), (0, 1)])]))
>>> bbq.st_intersection(s1, Polygon([(0, 0), (1, 1), (0, 1)]))
0 POLYGON ((0 0, 0.99954 1, 0 1, 0 0))
1 None
2 None
3 None
4 None
1 POLYGON ((0 0, 0.99954 1, 0 1, 0 0))
2 LINESTRING (0 0, 0.99954 1)
3 GEOMETRYCOLLECTION EMPTY
4 POINT (0 1)
dtype: geometry

Args:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Missing series argument here.

other (GeoSeries or geometric object):
The Geoseries (elementwise) or geometric object to find the
intersection with.
series (bigframes.pandas.Series | bigframes.geopandas.GeoSeries):
A series containing geography objects.
other (bigframes.pandas.Series | bigframes.geopandas.GeoSeries | shapely.Geometry):
The series or geometric object to intersect with the geography
objects in ``series``.

Returns:
bigframes.geopandas.GeoSeries:
Expand Down
12 changes: 12 additions & 0 deletions bigframes/core/compile/scalar_op_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -1023,6 +1023,13 @@ def geo_st_difference_op_impl(x: ibis_types.Value, y: ibis_types.Value):
)


@scalar_op_compiler.register_binary_op(ops.GeoStDistanceOp, pass_op=True)
def geo_st_distance_op_impl(
    x: ibis_types.Value, y: ibis_types.Value, op: ops.GeoStDistanceOp
):
    """Compile ``GeoStDistanceOp`` into a call to ``st_distance``.

    Registered with ``pass_op=True`` and takes the op instance as ``op``;
    its ``use_spheroid`` attribute is forwarded as the third argument.
    """
    use_spheroid = op.use_spheroid
    return st_distance(x, y, use_spheroid)


@scalar_op_compiler.register_unary_op(ops.geo_st_geogfromtext_op)
def geo_st_geogfromtext_op_impl(x: ibis_types.Value):
# Ibis doesn't seem to provide a dedicated method to cast from string to geography,
Expand Down Expand Up @@ -1989,6 +1996,11 @@ def st_boundary(a: ibis_dtypes.geography) -> ibis_dtypes.geography: # type: ign
"""Find the boundary of a geography."""


@ibis_udf.scalar.builtin
def st_distance(a: ibis_dtypes.geography, b: ibis_dtypes.geography, use_spheroid: bool) -> ibis_dtypes.float:  # type: ignore
    """Find the distance between two geographies.

    ``use_spheroid`` is passed through as the third argument of the builtin.
    """


@ibis_udf.scalar.builtin
def unix_micros(a: ibis_dtypes.timestamp) -> int: # type: ignore
"""Convert a timestamp to microseconds"""
Expand Down
24 changes: 13 additions & 11 deletions bigframes/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -586,30 +586,32 @@ def _is_bigframes_dtype(dtype) -> bool:
return False


def _infer_dtype_from_python_type(type: type) -> Dtype:
if type in (datetime.timedelta, pd.Timedelta, np.timedelta64):
def _infer_dtype_from_python_type(type_: type) -> Dtype:
if type_ in (datetime.timedelta, pd.Timedelta, np.timedelta64):
# Must check timedelta type first. Otherwise other branches will be evaluated to true
# E.g. np.timedelta64 is a subclass of np.integer
return TIMEDELTA_DTYPE
if issubclass(type, (bool, np.bool_)):
if issubclass(type_, (bool, np.bool_)):
return BOOL_DTYPE
if issubclass(type, (int, np.integer)):
if issubclass(type_, (int, np.integer)):
return INT_DTYPE
if issubclass(type, (float, np.floating)):
if issubclass(type_, (float, np.floating)):
return FLOAT_DTYPE
if issubclass(type, decimal.Decimal):
if issubclass(type_, decimal.Decimal):
return NUMERIC_DTYPE
if issubclass(type, (str, np.str_)):
if issubclass(type_, (str, np.str_)):
return STRING_DTYPE
if issubclass(type, (bytes, np.bytes_)):
if issubclass(type_, (bytes, np.bytes_)):
return BYTES_DTYPE
if issubclass(type, datetime.date):
if issubclass(type_, datetime.date):
return DATE_DTYPE
if issubclass(type, datetime.time):
if issubclass(type_, datetime.time):
return TIME_DTYPE
if issubclass(type_, shapely.Geometry):
return GEO_DTYPE
else:
raise TypeError(
f"No matching datatype for python type: {type}. {constants.FEEDBACK_LINK}"
f"No matching datatype for python type: {type_}. {constants.FEEDBACK_LINK}"
)


Expand Down
22 changes: 5 additions & 17 deletions bigframes/geopandas/geoseries.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,23 +47,6 @@ def y(self) -> bigframes.series.Series:
# we can.
@property
def area(self, crs=None) -> bigframes.series.Series: # type: ignore
"""Returns a Series containing the area of each geometry in the GeoSeries
expressed in the units of the CRS.

Args:
crs (optional):
Coordinate Reference System of the geometry objects. Can be
anything accepted by pyproj.CRS.from_user_input(), such as an
authority string (eg “EPSG:4326”) or a WKT string.

Returns:
bigframes.pandas.Series:
Series of float representing the areas.

Raises:
NotImplementedError:
GeoSeries.area is not supported. Use bigframes.bigquery.st_area(series), instead.
"""
raise NotImplementedError(
f"GeoSeries.area is not supported. Use bigframes.bigquery.st_area(series), instead. {constants.FEEDBACK_LINK}"
)
Expand Down Expand Up @@ -97,5 +80,10 @@ def to_wkt(self: GeoSeries) -> bigframes.series.Series:
def difference(self: GeoSeries, other: GeoSeries) -> bigframes.series.Series:  # type: ignore
    """Apply ``ops.geo_st_difference_op`` between this series and ``other``."""
    difference_op = ops.geo_st_difference_op
    return self._apply_binary_op(other, difference_op)

def distance(self: GeoSeries, other: GeoSeries) -> bigframes.series.Series:  # type: ignore
    """Unsupported; always raises and points to ``bigframes.bigquery.st_distance``.

    Raises:
        NotImplementedError:
            GeoSeries.distance is not supported. Use
            bigframes.bigquery.st_distance(series, other), instead.
    """
    message = f"GeoSeries.distance is not supported. Use bigframes.bigquery.st_distance(series, other), instead. {constants.FEEDBACK_LINK}"
    raise NotImplementedError(message)

def intersection(self: GeoSeries, other: GeoSeries) -> bigframes.series.Series:  # type: ignore
    """Apply ``ops.geo_st_intersection_op`` between this series and ``other``."""
    intersection_op = ops.geo_st_intersection_op
    return self._apply_binary_op(other, intersection_op)
2 changes: 2 additions & 0 deletions bigframes/operations/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@
geo_st_intersection_op,
geo_x_op,
geo_y_op,
GeoStDistanceOp,
)
from bigframes.operations.json_ops import (
JSONExtract,
Expand Down Expand Up @@ -375,6 +376,7 @@
"geo_st_intersection_op",
"geo_x_op",
"geo_y_op",
"GeoStDistanceOp",
# Numpy ops mapping
"NUMPY_TO_BINOP",
"NUMPY_TO_OP",
Expand Down
11 changes: 11 additions & 0 deletions bigframes/operations/geo_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import dataclasses

from bigframes import dtypes
from bigframes.operations import base_ops
import bigframes.operations.type as op_typing
Expand Down Expand Up @@ -69,3 +71,12 @@
geo_st_intersection_op = base_ops.create_binary_op(
name="geo_st_intersection", type_signature=op_typing.BinaryGeo()
)


@dataclasses.dataclass(frozen=True)
class GeoStDistanceOp(base_ops.BinaryOp):
    """Binary op whose result type is always ``dtypes.FLOAT_DTYPE``.

    ``use_spheroid`` is the only dataclass field declared here, so it must
    be supplied when the op is constructed.
    """

    # Declared with an annotation, which makes it a dataclass field.
    use_spheroid: bool

    # Unannotated, so dataclasses does not treat it as a field.
    name = "st_distance"

    def output_type(
        self, *input_types: dtypes.ExpressionType
    ) -> dtypes.ExpressionType:
        # The input types are not consulted; the result is always float.
        return dtypes.FLOAT_DTYPE
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@
"pyarrow >=15.0.2",
"pydata-google-auth >=1.8.2",
"requests >=2.27.1",
"shapely >=1.8.5",
"shapely >=2.0.0",
"sqlglot >=23.6.3",
"tabulate >=0.9",
"ipywidgets >=7.7.1",
Expand Down
Loading
0