diff --git a/docs/source/publishing/ogcapi-features.rst b/docs/source/publishing/ogcapi-features.rst index 3a1c2e9e5..b44f8c251 100644 --- a/docs/source/publishing/ogcapi-features.rst +++ b/docs/source/publishing/ogcapi-features.rst @@ -450,7 +450,7 @@ Connection password: geo_test # external_auth: wallet # tns_name: XEPDB1 - # tns_admin /opt/oracle/client/network/admin + # tns_admin /opt/oracle/client/network/admin # init_oracle_client: True id_field: id @@ -458,7 +458,7 @@ Connection geom_field: geometry title_field: name -The provider supports connection over host and port with SID, SERVICE_NAME or TNS_NAME. For TNS naming, the system +The provider supports connection over host and port with SID, SERVICE_NAME or TNS_NAME. For TNS naming, the system environment variable TNS_ADMIN or the configuration parameter tns_admin must be set. The providers supports external authentication. At the moment only wallet authentication is implemented. @@ -484,7 +484,7 @@ SDO options title_field: name sdo_operator: sdo_relate # defaults to sdo_filter sdo_param: mask=touch+coveredby # defaults to mask=anyinteract - + The provider supports two different SDO operators, sdo_filter and sdo_relate. When not set, the default is sdo_relate! Further more it is possible to set the sdo_param option. When sdo_relate is used the default is anyinteraction! `See Oracle Documentation for details `_. @@ -509,7 +509,7 @@ Mandatory properties mandatory_properties: - example_group_id -On large tables it could be useful to disallow a query on the complete dataset. For this reason it is possible to +On large tables it could be useful to disallow a query on the complete dataset. For this reason it is possible to configure mandatory properties. When this is activated, the provider throws an exception when the parameter is not in the query uri. @@ -556,13 +556,13 @@ Extra_params """""""""""" The Oracle provider allows for additional parameters that can be passed in the request. 
It allows for the processing of additional parameters that are not defined in the ``pygeoapi-config.yml`` to be passed to a custom SQL-Manipulator-Plugin. An example use case of this is advanced filtering without exposing the filtered columns like follows ``.../collections/some_data/items?is_recent=true``. The ``SqlManipulator`` plugin's ``process_query`` method would receive ``extra_params = {'is_recent': 'true'}`` and could dynamically add a custom condition to the SQL query, like ``AND SYSDATE - create_date < 30``. -The ``include_extra_query_parameters`` has to be set to ``true`` for the collection in ``pygeoapi-config.yml``. This ensures that the additional request parameters (e.g. ``is_recent=true``) are not discarded. +The ``include_extra_query_parameters`` has to be set to ``true`` for the collection in ``pygeoapi-config.yml``. This ensures that the additional request parameters (e.g. ``is_recent=true``) are not discarded. Custom SQL Manipulator Plugin """"""""""""""""""""""""""""" The provider supports a SQL-Manipulator-Plugin class. With this, the SQL statement could be manipulated. This is -useful e.g. for authorization at row level or manipulation of the explain plan with hints. +useful e.g. for authorization at row level or manipulation of the explain plan with hints. More information and examples about this feature can be found in ``tests/provider/test_oracle_provider.py``. 
@@ -584,14 +584,14 @@ To publish a GeoParquet file (with a geometry column) the geopandas package is a providers: - type: feature name: Parquet - data: + data: source: ./tests/data/parquet/random.parquet id_field: id time_field: time x_field: - minlon - maxlon - y_field: + y_field: - minlat - maxlat @@ -663,6 +663,24 @@ These are optional and if not specified, the default from the engine will be use table: hotosm_bdi_waterways geom_field: foo_geom +Due to PostgreSQL's unique multi-read, multi-write strategy, row counts for queries that return a lot of rows can take a significant amount of time to complete. To address this issue, the following settings can be configured on your PostgreSQL provider: + +.. csv-table:: + :header: Name, Type, Default Value, Description + :align: left + + postgresql_pseudo_count_enabled, Boolean, false, Enables pseudo count. + postgresql_pseudo_count_start, Integer, 5000000, Sets the minimum number of rows a table must have before a pseudo count is performed when pseudo counts are enabled. + +This solution uses the built-in PostgreSQL EXPLAIN command to “guess” the number of rows a given query will return. If that value is greater than or equal to the postgresql_pseudo_count_start value, then the “guessed” value from the EXPLAIN command is returned in the response. If the “guessed” value is lower, then a full count is completed and its result is returned in the response. These settings do not affect the following types of requests: + +* Requests with a Result Type of Hits. +* Requests with a CQL filter. +* Requests with a BBOX filter. +* Requests with a Temporal filter. + +Using these pseudo count options allows you to granularly configure each PostgreSQL provider to get the best performance out of your API. However, when choosing whether or not to use these pseudo count settings, it is important to understand the trade-off you are making. 
By enabling pseudo counts you are choosing speed over accuracy: some pseudo counts may be higher and others lower than the true row count for a given query. But with some datasets this trade-off might be better than not being able to provide the data at all. While these settings have default values, pseudo counts must be enabled for them to be used. + The PostgreSQL provider is also able to connect to Cloud SQL databases. .. code-block:: yaml diff --git a/pygeoapi/provider/sql.py b/pygeoapi/provider/sql.py index cba5abaea..524880769 100644 --- a/pygeoapi/provider/sql.py +++ b/pygeoapi/provider/sql.py @@ -57,6 +57,7 @@ from decimal import Decimal import functools import logging +from re import search from typing import Optional from geoalchemy2 import Geometry # noqa - this isn't used explicitly but is needed to process Geometry columns @@ -71,7 +72,8 @@ PrimaryKeyConstraint, asc, desc, - delete + delete, + text ) from sqlalchemy.engine import URL from sqlalchemy.exc import ( @@ -127,6 +129,12 @@ def __init__( self.id_field = provider_def['id_field'] self.geom = provider_def.get('geom_field', 'geom') self.driver_name = driver_name + self.postgresql_pseudo_count_enabled = provider_def.get( + 'postgresql_pseudo_count_enabled', False + ) + self.postgresql_pseudo_count_start = provider_def.get( + 'postgresql_pseudo_count_start', 5000000 + ) LOGGER.debug(f'Name: {self.name}') LOGGER.debug(f'Table: {self.table}') @@ -738,6 +746,121 @@ def _get_bbox_filter(self, bbox: list[float]): return bbox_filter + def query( + self, + offset=0, + limit=10, + resulttype='results', + bbox=[], + datetime_=None, + properties=[], + sortby=[], + select_properties=[], + skip_geometry=False, + q=None, + filterq=None, + crs_transform_spec=None, + **kwargs + ): + """ + Query sql database for all the content. + e.g.: http://localhost:5000/collections/hotosm_bdi_waterways/items? 
+ limit=1&resulttype=results + + :param offset: starting record to return (default 0) + :param limit: number of records to return (default 10) + :param resulttype: return results or hits only (default results) + :param bbox: bounding box [minx,miny,maxx,maxy] + :param datetime_: temporal (datestamp or extent) + :param properties: list of tuples (name, value) + :param sortby: list of dicts (property, order) + :param select_properties: list of property names + :param skip_geometry: bool of whether to skip geometry (default False) + :param q: full-text search term(s) (not used by this method) + :param filterq: CQL query as text string + :param crs_transform_spec: `CrsTransformSpec` instance, optional + + :returns: GeoJSON FeatureCollection with numberMatched/numberReturned + """ + + LOGGER.debug('Preparing filters') + property_filters = self._get_property_filters(properties) + cql_filters = self._get_cql_filters(filterq) + bbox_filter = self._get_bbox_filter(bbox) + time_filter = self._get_datetime_filter(datetime_) + order_by_clauses = self._get_order_by_clauses(sortby, self.table_model) + selected_properties = self._select_properties_clause( + select_properties, skip_geometry + ) + + LOGGER.debug('Querying Database') + # Execute query within self-closing database Session context + with Session(self._engine) as session: + results = ( + session.query(self.table_model) + .filter(property_filters) + .filter(cql_filters) + .filter(bbox_filter) + .filter(time_filter) + .options(selected_properties) + ) + + LOGGER.debug(f'PostgreSQL pseudo count enabled: {self.postgresql_pseudo_count_enabled}') # noqa + LOGGER.debug(f'PostgreSQL pseudo count start: {self.postgresql_pseudo_count_start}') # noqa + + if self.postgresql_pseudo_count_enabled: + # This condition tests `is not True` for cql_filters, bbox_filter and time_filter because they are True even when no value is provided: an empty object is always set for each of them if the user provides no value. 
# noqa + if resulttype == 'hits' or cql_filters is not True or bbox_filter is not True or time_filter is not True: # noqa + matched = results.count() + LOGGER.debug('Full count executed (hits or filters)') + else: + compiled_query = results.statement.compile( + self._engine, + compile_kwargs={"literal_binds": True} + ) + explain_query = f"EXPLAIN {compiled_query}" + query_explanation = session.execute(text(explain_query)) + explanation_overview = query_explanation.fetchone() + match = ( + search(r'rows=(\d+)', str(explanation_overview[0])) + if explanation_overview else "" + ) + matched = int(match.group(1)) if match else 0 + LOGGER.debug('Pseudo count executed') + + if matched < self.postgresql_pseudo_count_start: + matched = results.count() + LOGGER.debug('Full count executed (too few features)') + + else: + matched = results.count() + + LOGGER.debug(f'Found {matched} result(s)') + + LOGGER.debug('Preparing response') + response = { + 'type': 'FeatureCollection', + 'features': [], + 'numberMatched': matched, + 'numberReturned': 0 + } + + if resulttype == 'hits' or not results: + return response + + crs_transform_out = get_transform_from_spec(crs_transform_spec) + + for item in ( + results.order_by(*order_by_clauses).offset(offset).limit(limit) + ): + response['numberReturned'] += 1 + response['features'].append( + self._sqlalchemy_to_feature(item, crs_transform_out, + select_properties) + ) + + return response + class MySQLProvider(GenericSQLProvider): """ diff --git a/tests/data/dummy_data.sql b/tests/data/dummy_data.sql index d4755296a..67fb56403 100644 --- a/tests/data/dummy_data.sql +++ b/tests/data/dummy_data.sql @@ -6,37 +6,39 @@ CREATE EXTENSION IF NOT EXISTS postgis WITH SCHEMA dummy; CREATE TABLE IF NOT EXISTS dummy.buildings( gid serial PRIMARY KEY, centroid geometry(POINT, 25833), - contours geometry(POLYGON, 25833) + contours geometry(POLYGON, 25833), + building_type varchar, + datetime timestamp ); -INSERT INTO dummy.buildings(centroid, contours) 
+INSERT INTO dummy.buildings(centroid, contours, building_type, datetime) VALUES (ST_GeomFromText('POINT (473449 7463146)', 25833), - ST_GeomFromText('POLYGON ((473447.9967755177 7463140.685534775, 473453.51980463834 7463143.029921546, 473450.0032244818 7463151.314465227, 473444.4801953612 7463148.970078456, 473447.9967755177 7463140.685534775))', 25833)), + ST_GeomFromText('POLYGON ((473447.9967755177 7463140.685534775, 473453.51980463834 7463143.029921546, 473450.0032244818 7463151.314465227, 473444.4801953612 7463148.970078456, 473447.9967755177 7463140.685534775))', 25833), 'commercial', '2021-10-31 09:00:00.000'), (ST_GeomFromText('POINT (473458 7463104)', 25833), - ST_GeomFromText('POLYGON ((473460.9359104787 7463106.762323238, 473457.1106914547 7463107.931810057, 473455.06408952177 7463101.237676765, 473458.88930854574 7463100.068189946, 473460.9359104787 7463106.762323238))', 25833)), + ST_GeomFromText('POLYGON ((473460.9359104787 7463106.762323238, 473457.1106914547 7463107.931810057, 473455.06408952177 7463101.237676765, 473458.88930854574 7463100.068189946, 473460.9359104787 7463106.762323238))', 25833), 'commercial', '2021-10-31 09:00:00.000'), (ST_GeomFromText('POINT (473446 7463144)', 25833), - ST_GeomFromText('POLYGON ((473446.09474694915 7463138.853056925, 473450.31999101397 7463146.79958526, 473445.9052530499 7463149.146943075, 473441.6800089851 7463141.20041474, 473446.09474694915 7463138.853056925))', 25833)), + ST_GeomFromText('POLYGON ((473446.09474694915 7463138.853056925, 473450.31999101397 7463146.79958526, 473445.9052530499 7463149.146943075, 473441.6800089851 7463141.20041474, 473446.09474694915 7463138.853056925))', 25833), 'commercial', '2021-10-31 09:00:00.000'), (ST_GeomFromText('POINT (473449 7463142)', 25833), - ST_GeomFromText('POLYGON ((473452.3381955018 7463138.820935548, 473452.65221123956 7463144.812712757, 473445.6618044963 7463145.179064451, 473445.3477887586 7463139.187287242, 473452.3381955018 7463138.820935548))', 25833)), 
+ ST_GeomFromText('POLYGON ((473452.3381955018 7463138.820935548, 473452.65221123956 7463144.812712757, 473445.6618044963 7463145.179064451, 473445.3477887586 7463139.187287242, 473452.3381955018 7463138.820935548))', 25833), 'commercial', '2021-10-31 09:00:00.000'), (ST_GeomFromText('POINT (473443 7463137)', 25833), - ST_GeomFromText('POLYGON ((473447.7083111685 7463135.5571535295, 473440.9159249468 7463141.46168479, 473438.2916888306 7463138.44284647, 473445.0840750523 7463132.538315209, 473447.7083111685 7463135.5571535295))', 25833)), + ST_GeomFromText('POLYGON ((473447.7083111685 7463135.5571535295, 473440.9159249468 7463141.46168479, 473438.2916888306 7463138.44284647, 473445.0840750523 7463132.538315209, 473447.7083111685 7463135.5571535295))', 25833), 'residential', '2021-11-23 09:00:00.000'), (ST_GeomFromText('POINT (473433 7463125)', 25833), - ST_GeomFromText('POLYGON ((473432.73905580025 7463120.082489641, 473436.8249702975 7463128.10154836, 473433.2609442007 7463129.917510359, 473429.1750297034 7463121.898451641, 473432.73905580025 7463120.082489641))', 25833)), + ST_GeomFromText('POLYGON ((473432.73905580025 7463120.082489641, 473436.8249702975 7463128.10154836, 473433.2609442007 7463129.917510359, 473429.1750297034 7463121.898451641, 473432.73905580025 7463120.082489641))', 25833), 'commercial', '2021-10-31 09:00:00.000'), (ST_GeomFromText('POINT (473451 7463140)', 25833), - ST_GeomFromText('POLYGON ((473454.99435667787 7463139.456755368, 473453.4959303038 7463143.165490787, 473447.00564332213 7463140.543244633, 473448.5040696962 7463136.834509214, 473454.99435667787 7463139.456755368))', 25833)), + ST_GeomFromText('POLYGON ((473454.99435667787 7463139.456755368, 473453.4959303038 7463143.165490787, 473447.00564332213 7463140.543244633, 473448.5040696962 7463136.834509214, 473454.99435667787 7463139.456755368))', 25833), 'commercial', '2021-10-31 09:00:00.000'), (ST_GeomFromText('POINT (473438 7463144)', 25833), - ST_GeomFromText('POLYGON 
((473438.99554283824 7463137.7143898895, 473444.28561010957 7463144.995542839, 473437.00445716083 7463150.28561011, 473431.7143898895 7463143.00445716, 473438.99554283824 7463137.7143898895))', 25833)), + ST_GeomFromText('POLYGON ((473438.99554283824 7463137.7143898895, 473444.28561010957 7463144.995542839, 473437.00445716083 7463150.28561011, 473431.7143898895 7463143.00445716, 473438.99554283824 7463137.7143898895))', 25833), 'commercial', '2021-10-31 09:00:00.000'), (ST_GeomFromText('POINT (473474 7463101)', 25833), - ST_GeomFromText('POLYGON ((473474.83006438427 7463097.491297516, 473477.55805782415 7463100.416712323, 473473.1699356148 7463104.508702483, 473470.441942174 7463101.583287676, 473474.83006438427 7463097.491297516))', 25833)), + ST_GeomFromText('POLYGON ((473474.83006438427 7463097.491297516, 473477.55805782415 7463100.416712323, 473473.1699356148 7463104.508702483, 473470.441942174 7463101.583287676, 473474.83006438427 7463097.491297516))', 25833), 'commercial', '2021-10-31 09:00:00.000'), -- gid 10 (NULL, - ST_GeomFromText('POLYGON ((473464.1495667333 7463116.574655892, 473461.1307284124 7463119.1988920085, 473457.85043326765 7463115.425344108, 473460.8692715885 7463112.8011079915, 473464.1495667333 7463116.574655892))', 25833)), + ST_GeomFromText('POLYGON ((473464.1495667333 7463116.574655892, 473461.1307284124 7463119.1988920085, 473457.85043326765 7463115.425344108, 473460.8692715885 7463112.8011079915, 473464.1495667333 7463116.574655892))', 25833), 'commercial', '2021-10-31 09:00:00.000'), -- gid 11 (ST_GeomFromText('POINT (473461 7463116)', 25833), - NULL), + NULL, 'commercial', '2021-10-31 09:00:00.000'), -- gid 12 (NULL, - NULL); + NULL, 'commercial', '2021-10-31 09:00:00.000'); /* Two tables which create a naming conflict diff --git a/tests/provider/test_postgresql_provider.py b/tests/provider/test_postgresql_provider.py index c8cb8140e..f80f0ad3b 100644 --- a/tests/provider/test_postgresql_provider.py +++ 
b/tests/provider/test_postgresql_provider.py @@ -41,6 +41,7 @@ # test database in Docker from http import HTTPStatus +import logging import os import json import pytest @@ -67,6 +68,9 @@ from ..util import get_test_file_path, mock_api_request PASSWORD = os.environ.get('POSTGRESQL_PASSWORD', 'postgres') +HITS_FILTER_MESSAGE = "Full count executed (hits or filters)" +PSEUDO_COUNT_MESSAGE = "Pseudo count executed" +TOO_FEW_MESSAGE = "Full count executed (too few features)" @pytest.fixture() @@ -142,6 +146,30 @@ def pg_api_(openapi): return API(config, openapi) +@pytest.fixture() +def config_dummy(): + return { + 'name': 'PostgreSQL', + 'type': 'feature', + 'data': {'host': '127.0.0.1', + 'dbname': 'test', + 'user': 'postgres', + 'password': PASSWORD, + 'search_path': ['dummy', 'public'] + }, + 'options': { + 'connect_timeout': 10 + }, + 'id_field': 'gid', + 'table': 'buildings', + 'geom_field': 'centroid', + 'time_field': 'datetime', + 'postgresql_pseudo_count_enabled': True, + 'postgresql_pseudo_count_start': 1000, + 'storage_crs': 'http://www.opengis.net/def/crs/EPSG/0/25833' + } + + def test_valid_connection_options(config): if config.get('options'): keys = list(config['options'].keys()) @@ -901,3 +929,76 @@ def test_transaction_create_handles_invalid_input_data(pg_api_, data): headers, code, content = manage_collection_item( pg_api_, req, action='create', dataset='hot_osm_waterways') assert 'generic error' in content + + +@pytest.mark.parametrize("result_type, cql_filters, bbox_filter, time_filter, expected_count", [ # noqa + ("hits", None, [], None, 12), + ("", "building_type LIKE 'residential'", [], None, 1), + ("", None, [-2450512.62, 3680451.78, 2665647.82, 9493779.8], None, 10), + ("", None, [], '2021-11-23T09:00:00Z', 1) +]) +def test_resulttype_equals_hits_and_filters_force_full_count( + caplog: pytest.LogCaptureFixture, + config_dummy: dict, + result_type: str, + cql_filters: str, + bbox_filter: list[float], + time_filter: str, + expected_count: int +): + + 
# Arrange + provider = PostgreSQLProvider(config_dummy) + if cql_filters: + cql_filters = parse(cql_filters) + + with caplog.at_level(logging.DEBUG): + + # Act + results = provider.query( + resulttype=result_type, + filterq=cql_filters, + bbox=bbox_filter, + datetime_=time_filter + ) + + # Assert + assert HITS_FILTER_MESSAGE in caplog.text + assert results['numberMatched'] == expected_count + + +def test_pseudo_count( + caplog: pytest.LogCaptureFixture, + config_dummy: dict +): + + # Arrange + config_dummy['postgresql_pseudo_count_start'] = 1 + provider = PostgreSQLProvider(config_dummy) + + with caplog.at_level(logging.DEBUG): + + # Act + _ = provider.query() + + # Assert + assert PSEUDO_COUNT_MESSAGE in caplog.text + + +def test_postgresql_pseudo_count_start_forces_full_count( + caplog: pytest.LogCaptureFixture, + config_dummy: dict +): + + # Arrange + config_dummy['postgresql_pseudo_count_start'] = 1000000 + provider = PostgreSQLProvider(config_dummy) + + with caplog.at_level(logging.DEBUG): + + # Act + results = provider.query() + + # Assert + assert TOO_FEW_MESSAGE in caplog.text + assert results['numberMatched'] == 12