Skip to content

Commit

Permalink
fix(pageserver): handle lsn lease requests for unnormalized lsns (#9137)
Browse files Browse the repository at this point in the history
Fixes #9098.

## Problem

See
#9098 (comment).

### Related

A similar problem happened with branch creation, which was discussed
in #2143 (comment)
and fixed by #2529.

## Summary of changes

- Normalize the LSN on the pageserver side upon an LSN lease request, and
store the normalized LSN.

Signed-off-by: Yuchen Liang <[email protected]>
  • Loading branch information
yliang412 authored Sep 25, 2024
1 parent c597238 commit d447f49
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 5 deletions.
6 changes: 5 additions & 1 deletion pageserver/src/tenant/timeline.rs
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ use pageserver_api::reltag::RelTag;
use pageserver_api::shard::ShardIndex;

use postgres_connection::PgConnectionConfig;
use postgres_ffi::to_pg_timestamp;
use postgres_ffi::{to_pg_timestamp, v14::xlog_utils, WAL_SEGMENT_SIZE};
use utils::{
completion,
generation::Generation,
Expand Down Expand Up @@ -1337,6 +1337,10 @@ impl Timeline {
_ctx: &RequestContext,
) -> anyhow::Result<LsnLease> {
let lease = {
// Normalize the requested LSN to be aligned, and move to the first record
// if it points to the beginning of the page (header).
let lsn = xlog_utils::normalize_lsn(lsn, WAL_SEGMENT_SIZE);

let mut gc_info = self.gc_info.write().unwrap();

let valid_until = SystemTime::now() + length;
Expand Down
15 changes: 11 additions & 4 deletions test_runner/regress/test_readonly_node.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder):
Test static endpoint is protected from GC by acquiring and renewing lsn leases.
"""

LSN_LEASE_LENGTH = 8
neon_env_builder.num_pageservers = 2
# GC is manual triggered.
env = neon_env_builder.init_start(
Expand All @@ -139,7 +140,7 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder):
"image_creation_threshold": "1",
"image_layer_creation_check_threshold": "0",
# Short lease length to fit test.
"lsn_lease_length": "3s",
"lsn_lease_length": f"{LSN_LEASE_LENGTH}s",
},
initial_tenant_shard_count=2,
)
Expand Down Expand Up @@ -170,10 +171,14 @@ def generate_updates_on_main(
with env.endpoints.create_start("main") as ep_main:
with ep_main.cursor() as cur:
cur.execute("CREATE TABLE t0(v0 int primary key, v1 text)")
lsn = None
lsn = Lsn(0)
for i in range(2):
lsn = generate_updates_on_main(env, ep_main, i)

# Round down to the closest LSN on page boundary (unnormalized).
XLOG_BLCKSZ = 8192
lsn = Lsn((int(lsn) // XLOG_BLCKSZ) * XLOG_BLCKSZ)

with env.endpoints.create_start(
branch_name="main",
endpoint_id="static",
Expand All @@ -183,7 +188,8 @@ def generate_updates_on_main(
cur.execute("SELECT count(*) FROM t0")
assert cur.fetchone() == (ROW_COUNT,)

time.sleep(3)
# Wait for static compute to renew lease at least once.
time.sleep(LSN_LEASE_LENGTH / 2)

generate_updates_on_main(env, ep_main, i, end=100)

Expand All @@ -204,8 +210,9 @@ def generate_updates_on_main(
# Do some update so we can increment latest_gc_cutoff
generate_updates_on_main(env, ep_main, i, end=100)

# Wait for the existing lease to expire.
time.sleep(LSN_LEASE_LENGTH)
# Now trigger GC again, layers should be removed.
time.sleep(4)
for shard, ps in tenant_get_shards(env, env.initial_tenant):
client = ps.http_client()
gc_result = client.timeline_gc(shard, env.initial_timeline, 0)
Expand Down

1 comment on commit d447f49

@github-actions
Copy link

@github-actions github-actions bot commented on d447f49 Sep 25, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

5185 tests run: 5007 passed, 4 failed, 174 skipped (full report)


Failures on Postgres 16

  • test_heavy_write_workload[neon_on-github-actions-selfhosted-10-5-5]: release-x86-64
  • test_gc_feedback_with_snapshots[github-actions-selfhosted]: release-x86-64
  • test_pgbench[neon-github-actions-selfhosted-45-10]: release-x86-64
  • test_storage_controller_many_tenants[github-actions-selfhosted]: release-x86-64
# Run all failed tests locally:
scripts/pytest -vv -n $(nproc) -k "test_heavy_write_workload[neon_on-release-pg16-github-actions-selfhosted-10-5-5] or test_gc_feedback_with_snapshots[release-pg16-github-actions-selfhosted] or test_pgbench[neon-release-pg16-github-actions-selfhosted-45-10] or test_storage_controller_many_tenants[release-pg16-github-actions-selfhosted]"
Flaky tests (12)

Postgres 17

Postgres 16

Postgres 14

Code coverage* (full report)

  • functions: 32.0% (7493 of 23395 functions)
  • lines: 50.0% (60471 of 120849 lines)

* collected from Rust tests only


The comment gets automatically updated with the latest test results
d447f49 at 2024-09-25T21:32:27.403Z :recycle:

Please sign in to comment.