Skip to content

Commit

Permalink
Hyphenate addresses for search
Browse files Browse the repository at this point in the history
Also add feature to exclude words from search due to performance issues.
  • Loading branch information
mhieta committed May 23, 2024
1 parent 6310c9b commit 42c4657
Show file tree
Hide file tree
Showing 10 changed files with 241 additions and 18 deletions.
18 changes: 18 additions & 0 deletions services/fixtures/exclusion_words.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
[
{
"model": "services.exclusionword",
"pk": 1,
"fields": {
"word": "katu",
"language_short": "fi"
}
},
{
"model": "services.exclusionword",
"pk": 2,
"fields": {
"word": "tie",
"language_short": "fi"
}
}
]
62 changes: 54 additions & 8 deletions services/management/commands/index_search_columns.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
import logging
from datetime import datetime, timedelta

from django.contrib.postgres.search import SearchVector
from django.core.management.base import BaseCommand
from django.utils import timezone
from munigeo.models import Address, AdministrativeDivision

from services.models import Service, ServiceNode, Unit
from services.search.utils import hyphenate
from services.search.constants import HYPHENATE_ADDRESSES_MODIFIED_WITHIN_DAYS
from services.search.utils import get_foreign_key_attr, hyphenate

logger = logging.getLogger("services.management")

Expand All @@ -27,27 +30,40 @@ def get_search_column(model, lang):
return search_column


def generate_syllables(model):
def generate_syllables(
model, hyphenate_all_addresses=False, hyphenate_addresses_from=None
):
"""
Generates syllables for the given model.
"""
# Disable sending of signals
model._meta.auto_created = True
save_kwargs = {}
num_populated = 0
for row in model.objects.all():
if model.__name__ == "Address" and not hyphenate_all_addresses:
save_kwargs["skip_modified_at"] = True
if not hyphenate_addresses_from:
hyphenate_addresses_from = Address.objects.latest(
"modified_at"
).modified_at - timedelta(days=HYPHENATE_ADDRESSES_MODIFIED_WITHIN_DAYS)
qs = model.objects.filter(modified_at__gte=hyphenate_addresses_from)
else:
qs = model.objects.all()
for row in qs:
row.syllables_fi = []
for column in model.get_syllable_fi_columns():
row_content = getattr(row, column, None)
row_content = get_foreign_key_attr(row, column)
if row_content:
# Rows might be of type str or Array, if str
# cast to array by splitting.
if isinstance(row_content, str):
row_content = row_content.split()
for word in row_content:
syllables = hyphenate(word)
for s in syllables:
row.syllables_fi.append(s)
row.save()
if len(syllables) > 1:
for s in syllables:
row.syllables_fi.append(s)
row.save(**save_kwargs)
num_populated += 1
# Enable sending of signals
model._meta.auto_created = False
Expand Down Expand Up @@ -85,13 +101,43 @@ def index_servicenodes(lang):


class Command(BaseCommand):
def handle(self, *args, **kwargs):
def add_arguments(self, parser):
    """
    Register the command-line options that control address hyphenation.

    ``--hyphenate_addresses_from`` accepts an optional timestamp string in
    the form YYYY-MM-DDTHH:MM:SS; ``--hyphenate_all_addresses`` is a plain
    boolean flag that is False unless given.
    """
    # Keyword arguments are collected first so each option's configuration
    # can be read on its own; registration order is kept so the generated
    # help output lists the options in the same sequence.
    from_timestamp_option = dict(
        nargs="?",
        type=str,
        help="Hyphenate addresses whose modified_at timestamp starts at given timestamp YYYY-MM-DDTHH:MM:SS",
    )
    all_addresses_option = dict(
        action="store_true",
        help="Hyphenate all addresses",
    )

    parser.add_argument("--hyphenate_addresses_from", **from_timestamp_option)
    parser.add_argument("--hyphenate_all_addresses", **all_addresses_option)

def handle(self, *args, **options):
hyphenate_all_addresses = options.get("hyphenate_all_addresses", None)
hyphenate_addresses_from = options.get("hyphenate_addresses_from", None)

if hyphenate_addresses_from:
try:
hyphenate_addresses_from = timezone.make_aware(
datetime.strptime(hyphenate_addresses_from, "%Y-%m-%dT%H:%M:%S")
)
except ValueError as err:
raise ValueError(err)
for lang in ["fi", "sv", "en"]:
key = "search_column_%s" % lang
# Only generate syllables for the finnish language
if lang == "fi":
logger.info(f"Generating syllables for language: {lang}.")
logger.info(f"Syllables generated for {generate_syllables(Unit)} Units")
num_populated = generate_syllables(
Address,
hyphenate_all_addresses=hyphenate_all_addresses,
hyphenate_addresses_from=hyphenate_addresses_from,
)
logger.info(f"Syllables generated for {num_populated} Addresses")
logger.info(
f"Syllables generated for {generate_syllables(Service)} Services"
)
Expand Down
37 changes: 37 additions & 0 deletions services/migrations/0117_exclusionword.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# Generated by Django 5.0.6 on 2024-05-20 10:19

from django.db import migrations, models


class Migration(migrations.Migration):
    """Create the ExclusionWord model with its ``word`` and ``language_short`` fields."""

    # This migration is ordered after migration 0116 of the "services" app.
    dependencies = [
        ("services", "0116_alter_unit_address_postal_full_and_more"),
    ]

    operations = [
        migrations.CreateModel(
            name="ExclusionWord",
            fields=[
                (
                    "id",
                    models.AutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                # The excluded word itself, at most 100 characters.
                ("word", models.CharField(max_length=100, verbose_name="Word")),
                (
                    # Two-character language code, e.g. "fi".
                    "language_short",
                    models.CharField(max_length=2, verbose_name="Language short"),
                ),
            ],
            options={
                "verbose_name": "Exclusion word",
                "verbose_name_plural": "Exclusion words",
                # Default ordering is by descending id.
                "ordering": ["-id"],
            },
        ),
    ]
2 changes: 1 addition & 1 deletion services/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from .keyword import Keyword
from .mobility import MobilityServiceNode
from .notification import Announcement, ErrorMessage
from .search_rule import ExclusionRule
from .search_rule import ExclusionRule, ExclusionWord
from .service import Service, UnitServiceDetails
from .service_mapping import ServiceMapping
from .service_node import ServiceNode
Expand Down
13 changes: 13 additions & 0 deletions services/models/search_rule.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,16 @@ class Meta:

def __str__(self):
return "%s : %s" % (self.word, self.exclusion)


class ExclusionWord(models.Model):
    """
    A word to be excluded from search, paired with a language code.

    ``word`` holds the excluded word (up to 100 characters) and
    ``language_short`` the two-character code of the language it applies to.
    Instances are ordered by descending id by default.
    """

    word = models.CharField(
        verbose_name=_("Word"),
        max_length=100,
    )
    language_short = models.CharField(
        verbose_name=_("Language short"),
        max_length=2,
    )

    class Meta:
        verbose_name = _("Exclusion word")
        verbose_name_plural = _("Exclusion words")
        ordering = ["-id"]

    def __str__(self):
        # The word alone is the human-readable representation.
        return self.word
10 changes: 9 additions & 1 deletion services/search/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,10 @@
from drf_spectacular.utils import extend_schema, OpenApiParameter
from munigeo import api as munigeo_api
from munigeo.models import Address, AdministrativeDivision
from rest_framework import serializers
from rest_framework import serializers, status
from rest_framework.exceptions import ParseError
from rest_framework.generics import GenericAPIView
from rest_framework.response import Response

from services.api import (
TranslatedModelSerializer,
Expand Down Expand Up @@ -60,6 +61,7 @@
get_preserved_order,
get_service_node_results,
get_trigram_results,
has_exclusion_word_in_query,
set_address_fields,
set_service_node_unit_count,
set_service_unit_count,
Expand Down Expand Up @@ -468,6 +470,12 @@ def get(self, request):
else:
search_query_str = f"{q}:*"

if has_exclusion_word_in_query(q_vals, language_short):
return Response(
f"Search query {q_vals} would return too many results",
status=status.HTTP_400_BAD_REQUEST,
)

search_fn = "to_tsquery"
if use_websearch:
exclusions = self.get_search_exclusions(q)
Expand Down
4 changes: 3 additions & 1 deletion services/search/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,11 @@
"Address",
)
QUERY_PARAM_TYPE_NAMES = [m.lower() for m in SEARCHABLE_MODEL_TYPE_NAMES]
# None will slice to the end of list, e.g. no limit.
# None will slice to the end of list, i.e. no limit.
DEFAULT_MODEL_LIMIT_VALUE = None
# The limit value for the search query that searches the search_view. "NULL" = no limit
DEFAULT_SEARCH_SQL_LIMIT_VALUE = "NULL"
DEFAULT_TRIGRAM_THRESHOLD = 0.15
DEFAULT_RANK_THRESHOLD = 0

HYPHENATE_ADDRESSES_MODIFIED_WITHIN_DAYS = 7
29 changes: 28 additions & 1 deletion services/search/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,19 @@
)
from rest_framework.test import APIClient

from services.management.commands.index_search_columns import get_search_column
from services.management.commands.index_search_columns import (
generate_syllables,
get_search_column,
)
from services.management.commands.services_import.services import (
update_service_counts,
update_service_node_counts,
update_service_root_service_nodes,
)
from services.models import (
Department,
ExclusionRule,
ExclusionWord,
Service,
ServiceNode,
Unit,
Expand Down Expand Up @@ -243,6 +248,15 @@ def addresses(streets, municipality):
number=1,
full_name="Tarkk'ampujankatu 1",
)
Address.objects.create(
municipality_id=municipality.id,
location=Point(60.44879002342721, 22.283629416961055),
id=7,
street_id=46,
number=1,
full_name="Kellonsoittajankatu 1",
)
generate_syllables(Address)
Address.objects.update(search_column_fi=get_search_column(Address, "fi"))
return Address.objects.all()

Expand Down Expand Up @@ -280,4 +294,17 @@ def streets():
Street.objects.create(id=43, name="Markulantie", municipality_id="helsinki")
Street.objects.create(id=44, name="Yliopistonkatu", municipality_id="helsinki")
Street.objects.create(id=45, name="Tarkk'ampujankatu", municipality_id="helsinki")
Street.objects.create(id=46, name="Kellonsoittajankatu", municipality_id="helsinki")
return Street.objects.all()


@pytest.fixture
def exclusion_rules():
    """Create one ExclusionRule (word "tekojää", exclusion "-nurmi") and return all rules."""
    rule_fields = {"id": 1, "word": "tekojää", "exclusion": "-nurmi"}
    ExclusionRule.objects.create(**rule_fields)
    return ExclusionRule.objects.all()


@pytest.fixture
def exclusion_words():
    """Create one Finnish ExclusionWord ("katu") and return all exclusion words."""
    word_fields = {"id": 1, "word": "katu", "language_short": "fi"}
    ExclusionWord.objects.create(**word_fields)
    return ExclusionWord.objects.all()
18 changes: 18 additions & 0 deletions services/search/tests/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ def test_search(
administrative_division,
accessibility_shortcoming,
municipality,
exclusion_rules,
exclusion_words,
):
# Search for "museo" in entities: units, services and servicenodes
url = reverse("search") + "?q=museo&type=unit,service,servicenode"
Expand Down Expand Up @@ -120,6 +122,22 @@ def test_search(
assert kurrapolku["location"]["type"] == "Point"
assert kurrapolku["location"]["coordinates"][0] == 60.479032
assert kurrapolku["location"]["coordinates"][1] == 22.25417
# Test search with excluded word
url = reverse("search") + "?q=katu"
response = api_client.get(url)
assert response.status_code == 400
url = reverse("search") + "?q=Katu"
response = api_client.get(url)
assert response.status_code == 400
url = reverse("search") + "?q=koti katu"
response = api_client.get(url)
assert response.status_code == 400
# Test search with 'kello'
url = reverse("search") + "?q=kello&type=address"
response = api_client.get(url)
results = response.json()["results"]
assert len(results) == 1
assert results[0]["name"]["fi"] == "Kellonsoittajankatu 1"
# Test address search with apostrophe in query
url = reverse("search") + "?q=tarkk'ampujankatu&type=address"
response = api_client.get(url)
Expand Down
Loading

0 comments on commit 42c4657

Please sign in to comment.