Skip to content

Commit

Permalink
pageserver: handle shutdown cleanly in layer download API (#10598)
Browse files Browse the repository at this point in the history
## Problem

This API is used in tests and occasionally for support. It cast all
errors to 500.

That can cause a failure on the log checks:
https://neon-github-public-dev.s3.amazonaws.com/reports/main/13056992876/index.html#suites/ad9c266207b45eafe19909d1020dd987/683a7031d877f3db/

## Summary of changes

- Avoid using generic anyhow::Error for layer downloads
- Map shutdown cases to 503 in http route
  • Loading branch information
jcsp authored Jan 30, 2025
1 parent d18f619 commit e1273ac
Show file tree
Hide file tree
Showing 4 changed files with 25 additions and 16 deletions.
8 changes: 7 additions & 1 deletion pageserver/src/http/routes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1472,7 +1472,13 @@ async fn layer_download_handler(
let downloaded = timeline
.download_layer(&layer_name)
.await
.map_err(ApiError::InternalServerError)?;
.map_err(|e| match e {
tenant::storage_layer::layer::DownloadError::TimelineShutdown
| tenant::storage_layer::layer::DownloadError::DownloadCancelled => {
ApiError::ShuttingDown
}
other => ApiError::InternalServerError(other.into()),
})?;

match downloaded {
Some(true) => json_response(StatusCode::OK, ()),
Expand Down
2 changes: 1 addition & 1 deletion pageserver/src/tenant/storage_layer/layer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -340,7 +340,7 @@ impl Layer {
/// Download the layer if evicted.
///
/// Will not error when the layer is already downloaded.
pub(crate) async fn download(&self) -> anyhow::Result<()> {
pub(crate) async fn download(&self) -> Result<(), DownloadError> {
self.0.get_or_maybe_download(true, None).await?;
Ok(())
}
Expand Down
12 changes: 10 additions & 2 deletions pageserver/src/tenant/timeline.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2028,8 +2028,16 @@ impl Timeline {
pub(crate) async fn download_layer(
&self,
layer_file_name: &LayerName,
) -> anyhow::Result<Option<bool>> {
let Some(layer) = self.find_layer(layer_file_name).await? else {
) -> Result<Option<bool>, super::storage_layer::layer::DownloadError> {
let Some(layer) = self
.find_layer(layer_file_name)
.await
.map_err(|e| match e {
layer_manager::Shutdown => {
super::storage_layer::layer::DownloadError::TimelineShutdown
}
})?
else {
return Ok(None);
};

Expand Down
19 changes: 7 additions & 12 deletions test_runner/regress/test_ondemand_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
)
from fixtures.remote_storage import RemoteStorageKind, S3Storage, s3_storage
from fixtures.utils import query_scalar, wait_until
from urllib3 import Retry

if TYPE_CHECKING:
from typing import Any
Expand Down Expand Up @@ -676,16 +677,14 @@ def test_layer_download_cancelled_by_config_location(neon_env_builder: NeonEnvBu
"compaction_period": "0s",
}
)
client = env.pageserver.http_client()

# Disable retries, because we'll hit code paths that can give us
# 503 and want to see that directly
client = env.pageserver.http_client(retries=Retry(status=0))

failpoint = "before-downloading-layer-stream-pausable"
client.configure_failpoints((failpoint, "pause"))

env.pageserver.allowed_errors.extend(
[
".*downloading failed, possibly for shutdown.*",
]
)

info = client.layer_map_info(env.initial_tenant, env.initial_timeline)
assert len(info.delta_layers()) == 1

Expand Down Expand Up @@ -720,13 +719,9 @@ def test_layer_download_cancelled_by_config_location(neon_env_builder: NeonEnvBu

client.configure_failpoints((failpoint, "off"))

with pytest.raises(
PageserverApiException, match="downloading failed, possibly for shutdown"
):
with pytest.raises(PageserverApiException, match="Shutting down"):
download.result()

env.pageserver.assert_log_contains(".*downloading failed, possibly for shutdown.*")

detach.result()

client.configure_failpoints((failpoint, "pause"))
Expand Down

1 comment on commit e1273ac

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

7565 tests run: 7203 passed, 0 failed, 362 skipped (full report)


Flaky tests (7)

Postgres 17

Code coverage* (full report)

  • functions: 33.3% (8512 of 25534 functions)
  • lines: 49.1% (71500 of 145559 lines)

* collected from Rust tests only


The comment gets automatically updated with the latest test results
e1273ac at 2025-01-31T01:14:38.342Z :recycle:

Please sign in to comment.