Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve queries #61

Merged
merged 4 commits into from
Feb 13, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions sensorsafrica/api/v2/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@


class SensorDataStatSerializer(serializers.Serializer):
average = serializers.FloatField()
minimum = serializers.FloatField()
maximum = serializers.FloatField()
calculated_average = serializers.FloatField()
calculated_minimum = serializers.FloatField()
calculated_maximum = serializers.FloatField()
value_type = serializers.CharField(max_length=200)
start_datetime = serializers.DateTimeField()
end_datetime = serializers.DateTimeField()
Expand Down
93 changes: 41 additions & 52 deletions sensorsafrica/api/v2/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from django.utils import timezone
from dateutil.relativedelta import relativedelta
from django.db.models import ExpressionWrapper, F, FloatField, Max, Min, Sum, Avg, Q
from django.db.models.functions import Cast, TruncDate
from django.db.models.functions import Cast, TruncHour, TruncDay, TruncMonth
from rest_framework import mixins, pagination, viewsets

from ..models import SensorDataStat, LastActiveNodes, City, Node
Expand Down Expand Up @@ -60,6 +60,7 @@ def get_paginated_response(self, data_stats):
# If filtering from a date
# We will need to have a list of the value_types e.g. { 'P1': [{}, {}] }
from_date = self.request.query_params.get("from", None)
interval = self.request.query_params.get("interval", None)

results = {}
for data_stat in data_stats:
Expand All @@ -69,19 +70,19 @@ def get_paginated_response(self, data_stats):
if city_slug not in results:
results[city_slug] = {
"city_slug": city_slug,
value_type: [] if from_date else {},
value_type: [] if from_date or interval else {},
}

if value_type not in results[city_slug]:
results[city_slug][value_type] = [] if from_date else {}
results[city_slug][value_type] = [] if from_date or interval else {}

values = results[city_slug][value_type]
include_result = getattr(values, "append" if from_date else "update")
include_result = getattr(values, "append" if from_date or interval else "update")
include_result(
{
"average": data_stat["average"],
"minimum": data_stat["minimum"],
"maximum": data_stat["maximum"],
"average": data_stat["calculated_average"],
"minimum": data_stat["calculated_minimum"],
"maximum": data_stat["calculated_maximum"],
"start_datetime": data_stat["start_datetime"],
"end_datetime": data_stat["end_datetime"],
}
Expand Down Expand Up @@ -112,6 +113,7 @@ def get_queryset(self):
city_slugs = self.request.query_params.get("city", None)
from_date = self.request.query_params.get("from", None)
to_date = self.request.query_params.get("to", None)
interval = self.request.query_params.get("interval", None)

if to_date and not from_date:
raise ValidationError({"from": "Must be provide along with to query"})
Expand All @@ -129,43 +131,10 @@ def get_queryset(self):
)

if not from_date and not to_date:
return self._retrieve_past_24hrs(city_slugs, filter_value_types)

return self._retrieve_range(from_date, to_date, city_slugs, filter_value_types)

@staticmethod
def _retrieve_past_24hrs(city_slugs, filter_value_types):
to_date = timezone.now().replace(minute=0, second=0, microsecond=0)
from_date = to_date - datetime.timedelta(hours=24)

queryset = SensorDataStat.objects.filter(
value_type__in=filter_value_types,
timestamp__gte=from_date,
timestamp__lte=to_date,
)

if city_slugs:
queryset = queryset.filter(city_slug__in=city_slugs.split(","))

return (
queryset.order_by()
.values("value_type", "city_slug")
.annotate(
start_datetime=Min("timestamp"),
end_datetime=Max("timestamp"),
average=ExpressionWrapper(
Sum(F("average") * F("sample_size")) / Sum("sample_size"),
output_field=FloatField(),
),
minimum=Min("minimum"),
maximum=Max("maximum"),
)
.order_by("city_slug")
)

@staticmethod
def _retrieve_range(from_date, to_date, city_slugs, filter_value_types):
if not to_date:
to_date = timezone.now().replace(minute=0, second=0, microsecond=0)
from_date = to_date - datetime.timedelta(hours=24)
interval = 'day' if not interval else interval
elif not to_date:
from_date = beginning_of_day(from_date)
# Get data from_date until the end
# of day yesterday which is the beginning of today
Expand All @@ -177,27 +146,47 @@ def _retrieve_range(from_date, to_date, city_slugs, filter_value_types):
queryset = SensorDataStat.objects.filter(
value_type__in=filter_value_types,
timestamp__gte=from_date,
timestamp__lt=to_date,
timestamp__lte=to_date,
)

if interval == 'month':
truncate = TruncMonth("timestamp")
elif interval == 'day':
truncate = TruncDay("timestamp")
else:
truncate = TruncHour("timestamp")

if city_slugs:
queryset = queryset.filter(city_slug__in=city_slugs.split(","))

return (
queryset.annotate(date=TruncDate("timestamp"))
.values("date", "value_type")
queryset
.values(
"value_type",
"city_slug"
)
.annotate(
city_slug=F("city_slug"),
truncated_timestamp=truncate,
start_datetime=Min("timestamp"),
end_datetime=Max("timestamp"),
average=ExpressionWrapper(
calculated_average=ExpressionWrapper(
Sum(F("average") * F("sample_size")) / Sum("sample_size"),
output_field=FloatField(),
),
minimum=Min("minimum"),
maximum=Max("maximum"),
calculated_minimum=Min("minimum"),
calculated_maximum=Max("maximum"),
)
.values(
"value_type",
"city_slug",
"truncated_timestamp",
"start_datetime",
"end_datetime",
"calculated_average",
"calculated_minimum",
"calculated_maximum"
)
.order_by("-date")
.order_by("city_slug", "-truncated_timestamp")
)


Expand Down
55 changes: 30 additions & 25 deletions sensorsafrica/management/commands/calculate_data_statistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
from feinstaub.sensors.models import Node, Sensor, SensorDataValue, SensorLocation
from ...api.models import SensorDataStat

from django.core.paginator import Paginator


def map_stat(stat, city):
return SensorDataStat(
Expand All @@ -22,6 +24,12 @@ def map_stat(stat, city):
)


def chunked_iterator(queryset, chunk_size=100):
paginator = Paginator(queryset, chunk_size)
for page in range(1, paginator.num_pages + 1):
yield paginator.page(page).object_list


class Command(BaseCommand):
help = "Calculate and store data statistics"

Expand Down Expand Up @@ -64,32 +72,29 @@ def handle(self, *args, **options):
Q(value__regex=r"^\-?\d+(\.?\d+)?$"),
)

stats = list(
for stats in chunked_iterator(
queryset.annotate(timestamp=TruncHour("created"))
.values(
"timestamp",
"value_type",
"sensordata__sensor",
"sensordata__location",
"sensordata__sensor__node",
)
.order_by()
.annotate(
last_datetime=Max("created"),
average=Avg(Cast("value", FloatField())),
minimum=Min(Cast("value", FloatField())),
maximum=Max(Cast("value", FloatField())),
sample_size=Count("created", FloatField()),
)
.filter(
~Q(average=float("NaN")),
~Q(minimum=float("NaN")),
~Q(maximum=float("NaN")),
)
.order_by("-timestamp")
)

if len(stats):
.values(
"timestamp",
"value_type",
"sensordata__sensor",
"sensordata__location",
"sensordata__sensor__node",
)
.order_by()
.annotate(
last_datetime=Max("created"),
average=Avg(Cast("value", FloatField())),
minimum=Min(Cast("value", FloatField())),
maximum=Max(Cast("value", FloatField())),
sample_size=Count("created", FloatField()),
)
.filter(
~Q(average=float("NaN")),
~Q(minimum=float("NaN")),
~Q(maximum=float("NaN")),
)
.order_by("-timestamp")):
SensorDataStat.objects.bulk_create(
list(map(lambda stat: map_stat(stat, city), stats))
)
29 changes: 29 additions & 0 deletions sensorsafrica/tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import datetime
import math
from dateutil.relativedelta import relativedelta

import pytest
from django.core.management import call_command
Expand Down Expand Up @@ -229,6 +231,33 @@ def additional_sensorsdatastats(sensors, locations, sensorsdatastats):
call_command("calculate_data_statistics")


@pytest.fixture
def large_sensorsdatastats(sensors, locations):

now = timezone.now()
months = 6
points = math.floor((now - (now - relativedelta(months=months-1))).days * 24 * 60 / 5)
minutes = points * 5 * months
for point in range(1, points):
created_sd = SensorData.objects.create(sensor=sensors[0], location=locations[0])
created_sv = SensorDataValue.objects.create(sensordata=created_sd, value="4", value_type="P2")
created_sv.update_modified = False
created_sv.created = now - datetime.timedelta(minutes=point * 5)
created_sv.save()

last_date = created_sv.created

from django.core.management import call_command

call_command("calculate_data_statistics")

return {
'months': months,
'minutes': minutes,
'last_date': last_date
}


@pytest.fixture
def last_active(sensors, locations, sensorsdatastats):
timestamps = [
Expand Down
23 changes: 23 additions & 0 deletions sensorsafrica/tests/test_large_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import datetime

import pytest
from django.utils import timezone


@pytest.mark.django_db
class TestGettingDataFromLargeDataset:

def test_getting_air_data_on_large_dataset(self, client, large_sensorsdatastats):
response = client.get(
"/v2/data/air/?city=dar-es-salaam&interval=month&from=%s" %
large_sensorsdatastats["last_date"].date(),
format="json",
)
assert response.status_code == 200

data = response.json()

assert data["count"] == 1

assert type(data["results"][0]["P2"]) == list
assert len(data["results"][0]["P2"]) == large_sensorsdatastats["months"]
32 changes: 32 additions & 0 deletions sensorsafrica/tests/test_sensordatastats_view.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,3 +163,35 @@ def test_getting_air_data_now_with_additional_values(

assert result["P2"]["maximum"] == 8.0
assert result["P2"]["minimum"] == 0.0

def test_getting_air_data_by_hour(self, client, sensorsdatastats):
response = client.get(
"/v2/data/air/?city=dar-es-salaam&interval=hour",
format="json",
)
assert response.status_code == 200

data = response.json()

assert data["count"] == 1

assert type(data["results"][0]["P1"]) == list
assert len(data["results"][0]["P1"]) == 1
assert type(data["results"][0]["P2"]) == list
assert len(data["results"][0]["P2"]) == 4

def test_getting_air_data_by_month(self, client, sensorsdatastats):
response = client.get(
"/v2/data/air/?city=dar-es-salaam&interval=month",
format="json",
)
assert response.status_code == 200

data = response.json()

assert data["count"] == 1

assert type(data["results"][0]["P1"]) == list
assert len(data["results"][0]["P1"]) == 1
assert type(data["results"][0]["P2"]) == list
assert len(data["results"][0]["P2"]) == 1