Skip to content

Commit

Permalink
feat: add item_dropped_count and invalid_json_count to job context
Browse files Browse the repository at this point in the history
  • Loading branch information
yolile committed Jan 20, 2025
1 parent 71875b8 commit 9079500
Show file tree
Hide file tree
Showing 3 changed files with 27 additions and 0 deletions.
21 changes: 21 additions & 0 deletions data_registry/migrations/0060_alter_job_context.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Generated by Django 4.2.18 on 2025-01-20 13:58

from django.db import migrations, models


class Migration(migrations.Migration):
    """Update the help_text of Job.context to document the new
    item_dropped_count and invalid_json_count keys.

    AlterField on a JSONField's help_text is a no-op at the database level;
    this migration only refreshes Django's recorded model state.
    """

    # Builds on the migration that added Job.process_notes.
    dependencies = [
        ("data_registry", "0059_job_process_notes"),
    ]

    operations = [
        migrations.AlterField(
            model_name="job",
            name="context",
            # Field re-declared in full (blank/default unchanged) so the
            # migration state matches models.py; only help_text differs
            # from migration 0059's state.
            field=models.JSONField(
                blank=True,
                default=dict,
                help_text="<dl><dt><code>spider</code></dt><dd>The name of the spider in Kingfisher Collect</dd><dt><code>data_version</code></dt><dd>The data version of the crawl in Kingfisher Collect</dd><dt><code>job_id</code></dt><dd>The ID of the job in Scrapyd</dd><dt><code>scrapy_log</code></dt><dd>A local URL to the log file of the crawl in Scrapyd</dd><dt><code>item_dropped_count</code></dt><dd>The number of items dropped by the crawl</dd><dt><code>invalid_json_count</code></dt><dd>The number of invalid JSON files dropped by the crawl</dd><dt><code>process_id</code></dt><dd>The ID of the base collection in Kingfisher Process</dd><dt><code>process_id_pelican</code></dt><dd>The ID of the compiled collection in Kingfisher Process</dd><dt><code>pelican_id</code></dt><dd>The ID of the dataset in Pelican</dd><dt><code>pelican_dataset_name</code></dt><dd>The name of the dataset in Pelican</dd></dl>",
            ),
        ),
    ]
4 changes: 4 additions & 0 deletions data_registry/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,10 @@ class Status(models.TextChoices):
"<dd>The ID of the job in Scrapyd</dd>"
"<dt><code>scrapy_log</code></dt>"
"<dd>A local URL to the log file of the crawl in Scrapyd</dd>"
"<dt><code>item_dropped_count</code></dt>"
"<dd>The number of items dropped by the crawl</dd>"
"<dt><code>invalid_json_count</code></dt>"
"<dd>The number of invalid JSON files dropped by the crawl</dd>"
"<dt><code>process_id</code></dt>"
"<dd>The ID of the base collection in Kingfisher Process</dd>"
"<dt><code>process_id_pelican</code></dt>"
Expand Down
2 changes: 2 additions & 0 deletions data_registry/process_manager/task/collect.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,8 @@ def get_status(self):
for stat in ["item_dropped_count", "invalid_json_count"]:
if scrapy_log.logparser["crawler_stats"].get(stat):
logger.warning("%s: crawl %s: %s", self, stat, scrapy_log.logparser["crawler_stats"].get(stat))
self.job.context[stat] = scrapy_log.logparser["crawler_stats"].get(stat)
self.job.save(update_fields=["modified", "context"])

return Task.Status.COMPLETED

Expand Down

0 comments on commit 9079500

Please sign in to comment.