Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

implement faceted search (#845) #920

Merged
merged 4 commits into from
Nov 20, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions default-sample.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ database=sqlite:////var/www/pycsw/tests/functionaltests/suites/cite/data/cite.db
table=records
#filter=type = 'http://purl.org/dc/dcmitype/Dataset'
#max_retries=5
facets=type,title

[metadata:inspire]
enabled=true
Expand Down
1 change: 1 addition & 0 deletions docs/configuration.rst
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ pycsw's runtime configuration is defined by ``default.cfg``. pycsw ships with a
- **source**: the source of this repository only if not local (e.g. :ref:`geonode`, :ref:`odc`). Supported values are ``geonode``, ``odc``
- **filter**: server side database filter to apply as mask to all CSW requests (see :ref:`repofilters`)
- **max_retries**: max number of retry attempts when connecting to records-repository database
- **facets**: comma-separated list of record properties for which facets (value/count buckets) are computed in search results (default is ``type``)

.. note::

Expand Down
1 change: 1 addition & 0 deletions pycsw/core/repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,7 @@ def __init__(self, database, context, app_root=None, table='records', repo_filte
self.dbtype = self.engine.name

self.session = create_session(self.engine)
self.func = func

temp_dbtype = None

Expand Down
33 changes: 27 additions & 6 deletions pycsw/core/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,7 @@ def geojson_geometry2bbox(geometry):

return bbox


def wkt2geom(ewkt, bounds=True):
"""Return Shapely geometry object based on WKT/EWKT

Expand All @@ -207,8 +208,9 @@ def wkt2geom(ewkt, bounds=True):
Returns
-------
shapely.geometry.base.BaseGeometry or tuple
Depending on the value of the ``bounds`` parameter, returns either
the shapely geometry instance or a tuple with the bounding box.

Depending on the value of the ``bounds`` parameter, returns either
the shapely geometry instance or a tuple with the bounding box.

References
----------
Expand Down Expand Up @@ -346,8 +348,8 @@ def ipaddress_in_whitelist(ipaddress, whitelist):
if ip_in_network_cidr(ipaddress, white):
return True
elif white.find('*') != -1: # subnet wildcard
if ipaddress.startswith(white.split('*')[0]):
return True
if ipaddress.startswith(white.split('*')[0]):
return True
return False


Expand All @@ -372,7 +374,7 @@ def get_anytext_from_obj(obj):
"""
generate bag of text for free text searches
accepts dict, list or string
"""
"""

if isinstance(obj, dict):
for key, value in obj.items():
Expand Down Expand Up @@ -432,6 +434,7 @@ def secure_filename(filename):

return filename


def jsonify_links(links):
"""
pycsw:Links column data handler.
Expand All @@ -441,7 +444,7 @@ def jsonify_links(links):
LOGGER.debug('JSON link')
linkset = json.loads(links)
return linkset
except json.decoder.JSONDecodeError as err: # try CSV parsing
except json.decoder.JSONDecodeError: # try CSV parsing
LOGGER.debug('old style CSV link')
json_links = []
for link in links.split('^'):
Expand Down Expand Up @@ -525,3 +528,21 @@ def load_custom_repo_mappings(repository_mappings: str) -> typing.Optional[typin
if imported_mappings_module is not None:
result = getattr(imported_mappings_module, "MD_CORE_MODEL", None)
return result


def str2bool(value: typing.Union[bool, str]) -> bool:
    """
    helper function to return Python boolean
    type (source: https://stackoverflow.com/a/715468)

    :param value: value to be evaluated; strings are compared
                  case-insensitively, ignoring surrounding whitespace,
                  against 'yes', 'true', 't', '1' and 'on'; any
                  non-string value (including `None`) falls back to
                  plain Python truthiness

    :returns: `bool` of whether the value is boolean-ish
    """

    if isinstance(value, str):
        return value.strip().lower() in ('yes', 'true', 't', '1', 'on')

    # bools pass through unchanged; None and numbers use truthiness
    # rather than raising AttributeError on a missing .lower()
    return bool(value)
14 changes: 13 additions & 1 deletion pycsw/ogc/api/oapi.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,17 @@ def gen_oapi(config, oapi_filepath, mode='ogcapi-records'):
},
'style': 'form'
}
oapi['components']['parameters']['facets'] = {
'name': 'facets',
'in': 'query',
'description': 'Whether to include facets in results',
'schema': {
'type': 'boolean',
'default': False
},
'style': 'form',
'explode': False
}
# TODO: remove local definition of ids once implemented
# in OGC API - Records
oapi['components']['parameters']['ids'] = {
Expand Down Expand Up @@ -385,7 +396,8 @@ def gen_oapi(config, oapi_filepath, mode='ogcapi-records'):
{'$ref': '#/components/parameters/filter-lang'},
{'$ref': '#/components/parameters/f'},
{'$ref': '#/components/parameters/offset'},
{'$ref': '#/components/parameters/vendorSpecificParameters'}
{'$ref': '#/components/parameters/vendorSpecificParameters'},
{'$ref': '#/components/parameters/facets'},
],
'responses': {
'200': {
Expand Down
50 changes: 48 additions & 2 deletions pycsw/ogc/api/records.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@
from pycsw.core.config import StaticContext
from pycsw.core.metadata import parse_record
from pycsw.core.pygeofilter_evaluate import to_filter
from pycsw.core.util import bind_url, get_today_and_now, jsonify_links, load_custom_repo_mappings, wkt2geom
from pycsw.core.util import bind_url, get_today_and_now, jsonify_links, load_custom_repo_mappings, str2bool, wkt2geom
from pycsw.ogc.api.oapi import gen_oapi
from pycsw.ogc.api.util import match_env_var, render_j2_template, to_json

Expand Down Expand Up @@ -101,6 +101,7 @@ def __init__(self, config: ConfigParser):

LOGGER.debug(f'Server URL: {url_}')
self.config['server']['url'] = url_.rstrip('/')
self.facets = self.config['repository'].get('facets', 'type').split(',')

self.context = StaticContext()

Expand Down Expand Up @@ -511,6 +512,7 @@ def items(self, headers_, json_post_data, args, collection='metadata:main'):

reserved_query_params = [
'f',
'facets',
'filter',
'filter-lang',
'limit',
Expand All @@ -525,6 +527,7 @@ def items(self, headers_, json_post_data, args, collection='metadata:main'):

response = {
'type': 'FeatureCollection',
'facets': [],
'features': [],
'links': []
}
Expand All @@ -533,6 +536,7 @@ def items(self, headers_, json_post_data, args, collection='metadata:main'):
query_parser = None
sortby = None
limit = None
facets_requested = False
collections = []

if collection not in self.get_all_collections():
Expand Down Expand Up @@ -602,6 +606,8 @@ def items(self, headers_, json_post_data, args, collection='metadata:main'):
else:
query_args.append(f'{k} = "{v}"')

facets_requested = str2bool(args.get('facets', False))

if collection != 'metadata:main':
LOGGER.debug('Adding virtual collection filter')
query_args.append(f'parentidentifier = "{collection}"')
Expand Down Expand Up @@ -661,8 +667,17 @@ def items(self, headers_, json_post_data, args, collection='metadata:main'):
return self.get_exception(400, headers_, 'InvalidParameterValue', msg)

query = self.repository.session.query(self.repository.dataset).filter(filters)
if facets_requested:
LOGGER.debug('Running facet query')
facets_results = self.get_facets(filters)
else:
query = self.repository.session.query(self.repository.dataset)
facets_results = self.get_facets()

if facets_requested:
response['facets'] = facets_results
else:
response.pop('facets')

if 'sortby' in args:
LOGGER.debug('sortby specified')
Expand Down Expand Up @@ -971,7 +986,7 @@ def get_collection_info(self, collection_name: str = 'metadata:main',
}]
}

def get_all_collections(self):
def get_all_collections(self) -> list:
"""
Get all collections

Expand All @@ -983,6 +998,37 @@ def get_all_collections(self):

return [default_collection] + [vc.identifier for vc in virtual_collections]

def get_facets(self, filters=None) -> dict:
    """
    Build term facets (value/count buckets) for each configured facet

    :param filters: optional filter expression applied to every
                    facet query; when `None` the queries run unfiltered

    :returns: `dict` of facets, keyed by facet property name
    """

    repo = self.repository
    results = {}

    for facet in self.facets:
        LOGGER.debug(f'Running facet for {facet}')
        column = repo.query_mappings[facet]
        counter = repo.func.count(facet)
        grouped = repo.session.query(column, counter).group_by(facet)

        if filters is not None:
            grouped = grouped.filter(filters)

        LOGGER.debug('Writing facet query results')
        # each result row is (facet value, number of matching records)
        results[facet] = {
            'type': 'terms',
            'property': facet,
            'buckets': [
                {'value': row[0], 'count': row[1]}
                for row in grouped.all()
            ]
        }

    return results


def record2json(record, url, collection, mode='ogcapi-records'):
"""
Expand Down
33 changes: 26 additions & 7 deletions pycsw/ogc/api/templates/items.html
Original file line number Diff line number Diff line change
Expand Up @@ -24,20 +24,39 @@

<section id="items">

{% set nav_links = namespace(prev=None, next=None) %}
{% set nav_links = namespace(prev=None, next=None, self=None) %}
{% for link in data['links'] %}
{% if link['rel'] == 'prev' %}
{% set nav_links.prev = link['href'] %}
{% endif %}
{% if link['rel'] == 'next' %}
{% set nav_links.next = link['href'] %}
{% endif %}
{% if link['rel'] == 'prev' %}
{% set nav_links.prev = link['href'] %}
{% endif %}
{% if link['rel'] == 'self' %}
{% set nav_links.self = link['href'] %}
{% endif %}
{% if link['rel'] == 'next' %}
{% set nav_links.next = link['href'] %}
{% endif %}
{% endfor %}

<div class="container-fluid">
<div class="row">
<div class="col-lg-6">
<div id="records-map"></div>
<div id="facets">
{% if data['facets'] %}
{% for facet in data['facets'].keys() %}
<div class="card mt-3">
<div class="card-header text-capitalize">{{ facet }}</div>
<div class="card-body">
{% for bucket in data['facets'][facet].buckets %}
<a href="{{ nav_links.self.split('?')[0] }}?{{facet}}={{bucket['value']}}"
class="text-capitalize">{{bucket['value']}}</a>
<span class="badge rounded-pill bg-secondary" style="float:right">{{bucket['count']}}</span><br>
{% endfor %}
</div>
</div>
{% endfor %}
{% endif %}
</div>
</div>
<div class="col-lg-6">
{% if nav_links.prev %}
Expand Down
Loading