From 0a34e74a7c780625247f4e278b0cb2f929baa960 Mon Sep 17 00:00:00 2001
From: Andy Jackson <Andrew.Jackson@bl.uk>
Date: Thu, 9 Nov 2023 08:45:07 +0000
Subject: [PATCH] Update with separate Kafka UI.

---
 README.md                                |  52 ++++++---
 ingest/README.md                         | 131 ++++++++++++-----------
 ingest/fc/fc-kafka-ui/deploy.sh          |   3 +
 ingest/fc/fc-kafka-ui/docker-compose.yml |  19 ++++
 ingest/fc/fc-kafka/docker-compose.yml    |  11 --
 5 files changed, 128 insertions(+), 88 deletions(-)
 create mode 100644 ingest/fc/fc-kafka-ui/deploy.sh
 create mode 100644 ingest/fc/fc-kafka-ui/docker-compose.yml

diff --git a/README.md b/README.md
index 806ad3e..a556a2c 100755
--- a/README.md
+++ b/README.md
@@ -18,7 +18,10 @@ Deployment configuration for almost all UKWA services.
     - [Access](#access)
     - [Monitoring](#monitoring)
   - [Interfaces](#interfaces)
-  - [Networks](#networks)
+  - [Infrastructure](#infrastructure)
+    - [Access \& Updates](#access--updates)
+    - [Container Platforms](#container-platforms)
+    - [Networks](#networks)
 - [Software](#software)
   - [Deployment Process](#deployment-process)
 
@@ -27,7 +30,11 @@ Deployment configuration for almost all UKWA services.
 
 These [Docker Stack](https://docs.docker.com/engine/reference/commandline/stack/) configurations and related scripts are used to launch and manage our main services.  No internal or sensitive data is kept here -- that is stored in internal `ukwa-services-env` repository as environment variable scripts required for deployment, or as part of the CI/CD system.
 
-Note that some services are not deployed via containers, e.g. the Hadoop clusters, and the Solr and OutbackCDX indexes.  Those are documented elsewhere, but the interaction with those other services will be made clear.
+Note that some services are not deployed via containers, e.g. the Hadoop clusters, and the Solr and OutbackCDX indexes.  This includes a dedicated API server that acts as an intermediary for calls to various internal systems, allowing the implementation details of the current deployment to be kept separate from their external identity. 
+
+For example, our OutbackCDX service is accessed internally as `cdx.api.wa.bl.uk`.  Over recent years, this service has been migrated to new hardware on a number of occasions, but using the `cdx.api.wa.bl.uk` proxy alias has allowed us to minimise downtime when migrating or upgrading the service.
+
+These other services are documented elsewhere, but the interaction with those other services will be made clear.
 
 ## Service Stacks
 
@@ -47,25 +54,23 @@ The process for updating and deploying components is described in [the deploymen
 
 ## High-Level Technical Architecture
 
-This is a high-level introduction to the technical components that make up our web archiving services. The primary purpose of this documentation to try and ensure the whole team have an overview of the whole system, and can work out which components are involved when something goes wrong.
+This is a high-level introduction to the technical components that make up our web archiving services. The primary goal is to provide an overview of the whole system, with a particular focus on knowing where to look if something goes wrong.
 
 Some wider contextual information can be found at:
 
 *   [http://data.webarchive.org.uk/ukwa-documentation/how-ukwa-works/\_index.html](http://data.webarchive.org.uk/ukwa-documentation/how-ukwa-works/_index.html) (source [https://github.com/ukwa/ukwa-documentation/tree/master/content/how-ukwa-works](https://github.com/ukwa/ukwa-documentation/tree/master/content/how-ukwa-works))
 *   ...TBA...
 
-Note that the images on this page can be found at:
+Note that the images on this page can be found in [this Google Slides presentation.](https://docs.google.com/presentation/d/1MnJfldL7MvJYJ28genZqjmDoHhOlo8dRNqmuZGqa5fc/edit?usp=sharing)
 
-*   [This Google Slides presentation.](https://docs.google.com/presentation/d/1MnJfldL7MvJYJ28genZqjmDoHhOlo8dRNqmuZGqa5fc/edit?usp=sharing)
-*   ...TBA...
 
 ### Overview
 
 ![High-level technical overview of the UKWA systems](2023-11-UKWA-Tech-Arch-Overview.png)
 
-The life-cycle of our web archives can be broken down into five main stages, along with management and monitoring processes covering the whole process. Each stage is defined by it's interfaces, with the data standards and protocols that define what goes in to and out of that stage ([see below for more details](#interfaces)). This allows each stage to evolve independently, as long as it's 'contract' with the other stages is maintained.
+The life-cycle of our web archives can be broken down into five main stages, along with management and monitoring processes covering the whole thing, and the underlying infrastructure that supports it all. Each stage is defined by its interfaces, with the data standards and protocols that define what goes in to and out of that stage ([see below for more details](#interfaces)). This allows each stage to evolve independently, as long as its 'contract' with the other stages is maintained.
 
-There are multiple ingest streams, covering different aspects of a single overall workflow, starting with the curation tools that we use to drive the web crawlers. Those harvesting processes pull resources off the web and store them in archival form, to be transferred on HDFS. From there, we can ingest the content into other long-term stores, and can then be used to provide access to individual resources both internally and externally, for all the Legal Deposit libraries. As the system complexities and service levels vary significantly across the different access channels, we identify them as distinct services, while only have one (unified) harvesting service.
+There are multiple ingest streams, integrating different capture processes into a single overall workflow, starting with the curation tools that we use to drive the web crawlers. Those harvesting processes pull resources off the web and store them in archival form, to be transferred on HDFS. From there, we can ingest the content into other long-term stores, and it can then be used to provide access to individual resources both internally and externally, for all the Legal Deposit libraries. As the system complexities and service levels vary significantly across the different access channels, we identify them as distinct services, while only having one (unified) harvesting service.
 
 In order to be able to find items of interest among the billions of resources we hold, we run a range of data-mining processes on our collections that generate appropriate metadata, which is then combine with manually-generated annotations (supplied by our curators) and used to build our catalogue records and indexes. These records drive the discovery process, allowing users to find content which can then be displayed over the open web or via the reading room access service (as appropriate).
 
@@ -73,9 +78,9 @@ In order to be able to find items of interest among the billions of resources we
 
 #### Manage
 
-The critical management component is Apache Airflow, which orchestrates almost all web archive activity. For staff, it is accessible at [http://airflow.api.wa.bl.uk](http://airflow.api.wa.bl.uk). Each workflow (or DAG in Airflow terminology) is accessible via the management interface, and the description supplied with each one provides documentation on what the task does. Where possible, each individual task in a workflow involves running a command-line application wrapped in versioned Docker container. Developing our tools as individual command-line applications is intended to make them easier to maintain. The Airflow deployment and workflows are defined in the `./manage` folder, in [./manage/airflow](./manage/airflow)
+The critical management component is Apache Airflow, which orchestrates almost all web archive activity. For staff, it is accessible at [http://airflow.api.wa.bl.uk](http://airflow.api.wa.bl.uk). Each workflow (or DAG in Airflow terminology) is accessible via the management interface, and the description supplied with each one provides documentation on what the task does. Where possible, each individual task in a workflow involves running a single command-line application wrapped in a versioned Docker container. Developing our tools as individual command-line applications is intended to make them easier to develop, test and maintain. The Airflow deployment and workflows are defined in the `./manage` folder, in [./manage/airflow](./manage/airflow).
 
-Another important component is `TrackDB`, which contains a list of all the files on our storage systems, and it used by Airflow tasks to keep track of what's been indexed, etc.
+Another important component is `TrackDB`, which contains a list of all the files on our storage systems, and is used by Airflow tasks to keep track of what's been indexed, etc.
 
 See [`manage`](./manage/) for more details.
 
@@ -91,7 +96,7 @@ Storage systems are not deployed as containers, so there are no details here.  W
 
 #### Process
 
-There are various Airflow tasks that process the data from W3ACT or from the Hadoop storage. We use the Python MrJob library to run tasks, which are defined in the `ukwa/ukwa-manage` repository. That is quite a complex system, as it supports Hadoop 0.20.x and Hadoop 3.x, and supports tasks written in Java and Python. See [`ukwa/ukwa-manage`](https://github.com/ukwa/ukwa-manage) for more information.
+There are various Airflow tasks that process the data from W3ACT or from the Hadoop storage. We use the [Python MrJob library](https://mrjob.readthedocs.io/) to run tasks, which are defined in the `ukwa/ukwa-manage` repository. That is quite a complex system, as it supports Hadoop 0.20.x and Hadoop 3.x, and supports tasks written in Java and Python. See [`ukwa/ukwa-manage`](https://github.com/ukwa/ukwa-manage) for more information.
 
 #### Access
 
@@ -103,21 +108,36 @@ Our two main access services are:
 See [`access`](./access/) for more details.
 
 #### Monitoring
-
-Runs independently of all other systems, on separate dedicated hardware. Uses the Prometheus stack with alerts defined for major critical processes. See [https://github.com/ukwa/ukwa-monitor](https://github.com/ukwa/ukwa-monitor) for detail.
+ 
+Monitoring runs independently of all other systems, on separate dedicated hardware. Based on [Prometheus](https://prometheus.io/), with alerts defined for major critical processes. See [https://github.com/ukwa/ukwa-monitor](https://github.com/ukwa/ukwa-monitor) for detail.
 
 
 ### Interfaces
 
+There are data standards/protocols that isolate parts of the system so they can evolve independently (see [_How do you cut a monolith in half?_](https://programmingisterrible.com/post/162346490883/how-do-you-cut-a-monolith-in-half) for more on this idea).
+
 | Interface | Protocol | Further Details |
 | --------- | -------- | --------------- |
 | Curate > Crawl | Crawl feeds (seeds, frequencies, etc.), NEVER-CRAWL list. | Generated from W3ACT, see the [w3act\_export workflow](http://airflow.api.wa.bl.uk/dags/w3act_export/grid). |
 | Crawl > Storage | WARC/WACZ files and logs. | These are stored locally then moved to HDFS using Cron jobs (FC) and Airflow (DC, see [copy\_to\_hdfs\_crawler08](http://airflow.api.wa.bl.uk/dags/copy_to_hdfs_crawler08/grid)). | See the [HDFS layout](HDFS-file-system-layout-and-content_154765461.html) page which describes how we expect content to be layed out so it's provenance and nature are clear. |
 | Storage > Process | WARC/WACZ files and logs, Metadata from W3ACT exports. | This covers indexing tasks like CDX generation, full-text indexing etc. |
-| Process > Access |  WARCs/WACZ on HDFS via HTTP API + TrackDB. OutbackCDX API. Solr Full-text and Collections APIs. Data exported by w3act\_export (allows.aclj, blocks.aclj) | As the collection is large, access is powered by APIs rather than file-level standards.| 
+| Process > Access |  WARCs/WACZ on HDFS via HTTP API + TrackDB. OutbackCDX API. Solr Full-text and Collections APIs. Data exported by `w3act_export` (allows.aclj, blocks.aclj) | As the collection is large, access is powered by APIs rather than file-level standards.| 
+
+### Infrastructure
+
+#### Access & Updates
+
+A central server known as _wash_ is used to log into all systems, and runs updates and logging etc. at the system level via Cron jobs.
+
+A pair of servers use IP-failover to host the `*.api.wa.bl.uk` domains, running NGINX to proxy internal services to the appropriate back-end system.
+
+#### Container Platforms
+
+At the time of writing, we use Docker Swarm for production container deployment, and have a set of servers hosting _PROD_, _BETA_ and _DEV_ swarms.
+
 
 
-### Networks
+#### Networks
 
 The systems configured or maintained by the web archiving technical team are located on the following networks.
 
@@ -135,7 +155,7 @@ Almost our entire stack is open source, and the most critical components are co-
 
 Current upgrade work in progress:
 
-*   Reading Room access currently depends on OpenWayback but should be replaced with a modernized PyWB service through the [TP0012 Legal Deposit Access Solution](https://wiki.bl.uk:8443/display/WAG/TP0012+Legal+Deposit+Access+Solution) project.
+*   Reading Room access currently depends on OpenWayback but should be replaced with a modernized PyWB service through the _Legal Deposit Access Solution_ project.
 *   Adoption of Browsertrix Cloud for one-off crawls, with the intent to move all Frequent Crawls into it eventually.
 *   A new approach is needed to manage monitoring and replication of content across H020, H3 BSP and H3 NLS.
 *   Full-scale fulltext indexing remains a challenge and new workflows are needed.
diff --git a/ingest/README.md b/ingest/README.md
index 816cfbe..9daf941 100644
--- a/ingest/README.md
+++ b/ingest/README.md
@@ -2,11 +2,6 @@ The Ingest Stacks <!-- omit in toc -->
 =================
 
 - [Introduction](#introduction)
-- [Workflows](#workflows)
-  - [How the Frequent Crawler works](#how-the-frequent-crawler-works)
-  - [How the Document Harvester works](#how-the-document-harvester-works)
-    - [Known Failure Modes](#known-failure-modes)
-    - [Debugging Approach](#debugging-approach)
 - [Operations](#operations)
   - [Crawler Service Operations](#crawler-service-operations)
     - [Launching the Services](#launching-the-services)
@@ -18,6 +13,11 @@ The Ingest Stacks <!-- omit in toc -->
     - [Pause the crawl job(s)](#pause-the-crawl-jobs)
     - [Checkpoint the job(s)](#checkpoint-the-jobs)
     - [Shutdown](#shutdown-1)
+- [Workflows](#workflows)
+  - [How the Frequent Crawler works](#how-the-frequent-crawler-works)
+  - [How the Document Harvester works](#how-the-document-harvester-works)
+    - [Known Failure Modes](#known-failure-modes)
+    - [Debugging Approach](#debugging-approach)
 
 
 Introduction
@@ -31,70 +31,24 @@ This section covers the service stacks that are used for curation and for crawli
 
 The [`crawl_log_db`](./crawl_log_db/) service is not in use, but contains a useful example of how a Solr service and it's associated schema can be set up using the Solr API rather than maintaining XML configuration files.
 
-Workflows
----------
-
-The Ingest services work together in quite complicated ways, so this section attempts to describe some of the core workflows.  This should help determine what's happened if anything goes wrong.
-
-### How the Frequent Crawler works
-
-
-
-### How the Document Harvester works
-
-1.  Curators mark Targets as being Watched in W3ACT.
-2.  The [`w3act_export` workflow](http://airflow.api.wa.bl.uk/dags/w3act_export/grid) running on Airflow exports the data from W3ACT into files that contain this information.
-3.  The usual move-to-hdfs scripts move WARCs and logs onto the Hadoop store.
-4.  The TrackDB file tracking database gets updated so recent WARCs and crawl logs are known to the system. (See the `update_trackdb_*` tasks on [http://airflow.api.wa.bl.uk](http://airflow.api.wa.bl.uk/home)/).
-5.  The usual web archiving workflow indexes WARCs into the CDX service so items become available.
-6.  The Document Harvester [`ddhapt_log_analyse` workflow](http://airflow.api.wa.bl.uk/dags/ddhapt_log_analyse/grid) runs Hadoop jobs that take the W3ACT export data and use it to find potential documents in the crawl log.
-    1.  This currently means PDF files on Watched Targets.
-    2.  For each, a record is pushed to a dedicate PostgreSQL Document Database (a part of the W3ACT stack), with a status of _NEW_.
-7.  The Document Harvester [ddhapt\_process\_docs workflow](http://airflow.api.wa.bl.uk/dags/ddhapt_process_docs/grid) gets the most recent _NEW_ documents from the Document Database and attempts to enrich the metadata and post them to W3ACT.
-    1.  Currently, the metadata enrichment process talks to the live web rather than the web archive.
-    2.  In general, PDFs are associated with the website they are found from (the landing page), linked to the Target.
-    3.  For GOV.UK, we rely on the PDFs having a rel=up HTTP header that unambigiously links a PDF to it's landing page.
-    4.  The enriched metadata is then used to push a request to W3ACT. This metadata includes an access URL that points to the UKWA website on the public web ([see here for details](https://github.com/ukwa/ukwa-services/blob/aa95df6854382e6b6e84edc697dcb4da2804ef9c/access/website/config/nginx.conf#L154-L155)).
-    5.  W3ACT checks the file in question can be accessed via Wayback and calculates the checksum of the payload, or throws an error if it's not ready yet.
-    6.  If the submission works, the record is updated in the Document Database so it's no longer _NEW_.
-    7.  If it fails, it will be re-run in the future, so once it's available in Wayback it should turn up in W3ACT.
-8.  Curators review the Documents found for the Targets they own, and update the metadata as needed.
-9.  Curators then submit the Documents, which creats a XML SIP file that is passed to a DLS ingest process.
-10.  The DLS ingest process passes the metadata to MER and to Aleph.
-11.  The MER version is not used further.
-12.  The Aleph version then becomes the master metadata record, and is passed to Primo and LDLs via the Metadata Aggregator.
-13.  Links in e.g. Primo point to the access URLs included with the records, meaning users can find and access the documents.
-
-#### Known Failure Modes
-
-The Document Harvester has been fairly reliable in recent years, but some known failure modes may help resolve issues.
-
-*   Under certain circumstances, Heritrix has been known to stop rotating crawl logs properly. If this happens, crawl log files may stop appearing or get lost. Fixing this may require creating an empty crawl.log file in the right place so a checkpoint can rotate the files correctly, or in the worst cases, a full crawler restart. If this happens, crawl logs will stop arriving on HDFS.
-*   If there is a problem with the file tracking database getting updated to slowly, then the Document Harvester Airflow workflows may run but see nothing to process. This can be determined by checking the logs via Airflow, and checking that the expected number of crawl log files for that day were found. Clearing the job so Airflow re-runs it will resolve any gaps.
-*   If there is a problem with W3ACT (either directly, or with how it talks to the curators Wayback instance), then jobs may fail to upload processed Documents to W3ACT. This can be spotted by checking the logs via Airflow, but note that any Documents that have not yet been CDX indexed are expected to be logged as errors at this point, so it can be difficult to tell things apart. It may be necessary to inspect the W3ACT container logs to determine if there's a problem with W3ACT itself.
-
-#### Debugging Approach
-
-Problems will generally be raised by Jennie Grimshaw, who is usually able and happy to supply some example Document URLs that should have been spotted. This is very useful in that it provides some test URLs to run checks with, e.g.
-
-*   Check the URLs actually work and use `curl -v` to see if the `Link: rel=up` header is present (for GOV.UK) which helps find the landing page URL.
-*   Check the crawl-time CDX index (currently at [http://crawler06.bl.uk:8081/fc](http://crawler06.bl.uk:8081/fc)) to check if the URLs have been crawler at all.
-*   Check the access time CDX index (currently at [http://cdx.api.wa.bl.uk/data-heritrix](http://cdx.api.wa.bl.uk/data-heritrix)) to check if the items have been indexed correctly.
-*   Check the Curator Wayback service ([https://www.webarchive.org.uk/act/wayback/archive/](https://www.webarchive.org.uk/act/wayback/archive/)) to see if the URLs are accessible.
-*   Query the PostgreSQL Document Database to see if the URL was found by the crawl log processor and what the status of it is.
-
-Overall, the strategy is to work out where the problem has occurred in the chain of events outlined in the first section, and then modify and/or re-run the workflows as needed.
-
+- [ ] TBA move-to-S3?
 
 Operations
 ----------
 
-This section covers some common operations when interacting with the Ingest services. In particular, the operations for the Frequent Crawler and the Domain Crawler are very similar, so these are documented here.
+This section covers some common operations when interacting with the Ingest services. In particular, the operations for the Frequent Crawler (FC) and the Domain Crawler (DC) are very similar, so these are documented here.
 
 ### Crawler Service Operations
 
+Both the FC and the DC use the same software and same set of service stacks, just with different configuration via various environment variables.
+
+Both services have:
+
+* A Kafka stack, as Kafka is used to launch crawls and capture a copy of the crawl log.  This should always be started first, as Heritrix does not always cope when Kafka is not up and running. Note that the service being ready can take a lot longer than the Docker service takes to start up, depending on how large the topic logs are.  
+* A Kafka UI stack. This is optional, but useful for checking Kafka is actually ready for use, and for inspecting the contents of the Kafka topics.
+* A Worker stack, which contains one or two Heritrix instances, and supporting services like ClamAV.
+* A Wayback stack, which is optional, and can be used to look at what has been crawled (as long as the WARCs are still held locally). 
 
-TBA move-to-S3?
 
 #### Launching the Services
 
@@ -191,3 +145,58 @@ This is because the system will not attempt a new checkpoint if the crawl state
 At this point, all activity should have stopped, so it should not make much difference how exactly the service is halted.  To attempt to keep things as clean as possible, first terminate and then teardown the job(s) via the Heritrix UI.
 
 You can now shut down the services...
+
+
+Workflows
+---------
+
+The Ingest services work together in quite complicated ways, so this section attempts to describe some of the core workflows.  This should help determine what's happened if anything goes wrong.
+
+### How the Frequent Crawler works
+
+
+
+### How the Document Harvester works
+
+1.  Curators mark Targets as being Watched in W3ACT.
+2.  The [`w3act_export` workflow](http://airflow.api.wa.bl.uk/dags/w3act_export/grid) running on Airflow exports the data from W3ACT into files that contain this information.
+3.  The usual move-to-hdfs scripts move WARCs and logs onto the Hadoop store.
+4.  The TrackDB file tracking database gets updated so recent WARCs and crawl logs are known to the system. (See the `update_trackdb_*` tasks on [http://airflow.api.wa.bl.uk](http://airflow.api.wa.bl.uk/home).)
+5.  The usual web archiving workflow indexes WARCs into the CDX service so items become available.
+6.  The Document Harvester [`ddhapt_log_analyse` workflow](http://airflow.api.wa.bl.uk/dags/ddhapt_log_analyse/grid) runs Hadoop jobs that take the W3ACT export data and use it to find potential documents in the crawl log.
+    1.  This currently means PDF files on Watched Targets.
+    2.  For each, a record is pushed to a dedicated PostgreSQL Document Database (a part of the W3ACT stack), with a status of _NEW_.
+7.  The Document Harvester [ddhapt\_process\_docs workflow](http://airflow.api.wa.bl.uk/dags/ddhapt_process_docs/grid) gets the most recent _NEW_ documents from the Document Database and attempts to enrich the metadata and post them to W3ACT.
+    1.  Currently, the metadata enrichment process talks to the live web rather than the web archive.
+    2.  In general, PDFs are associated with the website they are found from (the landing page), linked to the Target.
+    3.  For GOV.UK, we rely on the PDFs having a rel=up HTTP header that unambiguously links a PDF to its landing page.
+    4.  The enriched metadata is then used to push a request to W3ACT. This metadata includes an access URL that points to the UKWA website on the public web ([see here for details](https://github.com/ukwa/ukwa-services/blob/aa95df6854382e6b6e84edc697dcb4da2804ef9c/access/website/config/nginx.conf#L154-L155)).
+    5.  W3ACT checks the file in question can be accessed via Wayback and calculates the checksum of the payload, or throws an error if it's not ready yet.
+    6.  If the submission works, the record is updated in the Document Database so it's no longer _NEW_.
+    7.  If it fails, it will be re-run in the future, so once it's available in Wayback it should turn up in W3ACT.
+8.  Curators review the Documents found for the Targets they own, and update the metadata as needed.
+9.  Curators then submit the Documents, which creates an XML SIP file that is passed to a DLS ingest process.
+10.  The DLS ingest process passes the metadata to MER and to Aleph.
+11.  The MER version is not used further.
+12.  The Aleph version then becomes the master metadata record, and is passed to Primo and LDLs via the Metadata Aggregator.
+13.  Links in e.g. Primo point to the access URLs included with the records, meaning users can find and access the documents.
+
+#### Known Failure Modes
+
+The Document Harvester has been fairly reliable in recent years, but some known failure modes may help resolve issues.
+
+*   Under certain circumstances, Heritrix has been known to stop rotating crawl logs properly. If this happens, crawl log files may stop appearing or get lost. Fixing this may require creating an empty crawl.log file in the right place so a checkpoint can rotate the files correctly, or in the worst cases, a full crawler restart. If this happens, crawl logs will stop arriving on HDFS.
+*   If there is a problem with the file tracking database getting updated too slowly, then the Document Harvester Airflow workflows may run but see nothing to process. This can be determined by checking the logs via Airflow, and checking that the expected number of crawl log files for that day were found. Clearing the job so Airflow re-runs it will resolve any gaps.
+*   If there is a problem with W3ACT (either directly, or with how it talks to the curators' Wayback instance), then jobs may fail to upload processed Documents to W3ACT. This can be spotted by checking the logs via Airflow, but note that any Documents that have not yet been CDX indexed are expected to be logged as errors at this point, so it can be difficult to tell things apart. It may be necessary to inspect the W3ACT container logs to determine if there's a problem with W3ACT itself.
+
+#### Debugging Approach
+
+Problems will generally be raised by Jennie Grimshaw, who is usually able and happy to supply some example Document URLs that should have been spotted. This is very useful in that it provides some test URLs to run checks with, e.g.
+
+*   Check the URLs actually work and use `curl -v` to see if the `Link: rel=up` header is present (for GOV.UK) which helps find the landing page URL.
+*   Check the crawl-time CDX index (currently at [http://crawler06.bl.uk:8081/fc](http://crawler06.bl.uk:8081/fc)) to check if the URLs have been crawled at all.
+*   Check the access time CDX index (currently at [http://cdx.api.wa.bl.uk/data-heritrix](http://cdx.api.wa.bl.uk/data-heritrix)) to check if the items have been indexed correctly.
+*   Check the Curator Wayback service ([https://www.webarchive.org.uk/act/wayback/archive/](https://www.webarchive.org.uk/act/wayback/archive/)) to see if the URLs are accessible.
+*   Query the PostgreSQL Document Database to see if the URL was found by the crawl log processor and what the status of it is.
+
+Overall, the strategy is to work out where the problem has occurred in the chain of events outlined in the first section, and then modify and/or re-run the workflows as needed.
diff --git a/ingest/fc/fc-kafka-ui/deploy.sh b/ingest/fc/fc-kafka-ui/deploy.sh
new file mode 100644
index 0000000..9275d24
--- /dev/null
+++ b/ingest/fc/fc-kafka-ui/deploy.sh
@@ -0,0 +1,3 @@
+#!/bin/sh
+docker stack deploy -c docker-compose.yml fc_ui_kafka  # deploys the docker-compose.yml in the current directory as the stack "fc_ui_kafka"
+
diff --git a/ingest/fc/fc-kafka-ui/docker-compose.yml b/ingest/fc/fc-kafka-ui/docker-compose.yml
new file mode 100644
index 0000000..fd1da55
--- /dev/null
+++ b/ingest/fc/fc-kafka-ui/docker-compose.yml
@@ -0,0 +1,19 @@
+version: "3.2"
+
+services:
+
+  kafka-ui:  # web UI for the Kafka cluster, deployed as its own stack
+    image: provectuslabs/kafka-ui:latest
+    ports:
+      - "9000:8080"  # host port 9000 -> container port 8080; quoted so YAML always reads it as a string
+    environment:
+      - "KAFKA_CLUSTERS_0_BOOTSTRAPSERVERS=kafka:9092"
+      - "DYNAMIC_CONFIG_ENABLED=true"
+    networks:
+      - kafka
+
+networks:
+  kafka:  # attaches to an existing network rather than creating one
+    external:
+      name: fc_kafka_default
+
diff --git a/ingest/fc/fc-kafka/docker-compose.yml b/ingest/fc/fc-kafka/docker-compose.yml
index 7cc7efd..ed2a5d1 100644
--- a/ingest/fc/fc-kafka/docker-compose.yml
+++ b/ingest/fc/fc-kafka/docker-compose.yml
@@ -51,17 +51,6 @@ services:
     #  mode: global
 
 
-  # Kafka UI
-  ui:
-    image: ukwa/docker-trifecta
-    ports:
-      - "9000:9000"
-    environment:
-      - "ZK_HOST=zookeeper:2181"
-    depends_on:
-      - zookeeper
-      - kafka
-
 networks:
   # Allow attachment of transient containers, external monitoring:
   default: