From 09750290d7a772088c35b402dca67156dc93bb40 Mon Sep 17 00:00:00 2001
From: Liu Xuezhao <xuezhao.liu@intel.com>
Date: Thu, 16 Jan 2025 23:15:09 +0800
Subject: [PATCH 1/7] DAOS-16943 rebuild: check DAOS_REBUILD ENV also for
 restart (#15728)

For restart, also check the DAOS_REBUILD ENV in ds_rebuild_regenerate_task
so that rebuild can be disabled.

Signed-off-by: Xuezhao Liu <xuezhao.liu@hpe.com>
---
 src/pool/srv_pool.c |  4 +++-
 src/rebuild/srv.c   | 15 +++++++++++++--
 2 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/src/pool/srv_pool.c b/src/pool/srv_pool.c
index 06a5f29e866..23053591449 100644
--- a/src/pool/srv_pool.c
+++ b/src/pool/srv_pool.c
@@ -1,5 +1,6 @@
 /*
  * (C) Copyright 2016-2024 Intel Corporation.
+ * (C) Copyright 2025 Hewlett Packard Enterprise Development LP
  *
  * SPDX-License-Identifier: BSD-2-Clause-Patent
  */
@@ -7240,7 +7241,8 @@ pool_svc_update_map(struct pool_svc *svc, crt_opcode_t opc, bool exclude_rank,
 	d_agetenv_str(&env, REBUILD_ENV);
 	if ((env && !strcasecmp(env, REBUILD_ENV_DISABLED)) ||
 	     daos_fail_check(DAOS_REBUILD_DISABLE)) {
-		D_DEBUG(DB_TRACE, "Rebuild is disabled\n");
+		D_DEBUG(DB_REBUILD, DF_UUID ": Rebuild is disabled for all pools\n",
+			DP_UUID(svc->ps_pool->sp_uuid));
 		d_freeenv_str(&env);
 		D_GOTO(out, rc = 0);
 	}
diff --git a/src/rebuild/srv.c b/src/rebuild/srv.c
index 4b8cdc7c029..6c18f6077e6 100644
--- a/src/rebuild/srv.c
+++ b/src/rebuild/srv.c
@@ -1,5 +1,6 @@
 /**
  * (C) Copyright 2016-2024 Intel Corporation.
+ * (C) Copyright 2025 Hewlett Packard Enterprise Development LP
  *
  * SPDX-License-Identifier: BSD-2-Clause-Patent
  */
@@ -2245,11 +2246,21 @@ regenerate_task_of_type(struct ds_pool *pool, pool_comp_state_t match_states, ui
 int
 ds_rebuild_regenerate_task(struct ds_pool *pool, daos_prop_t *prop)
 {
-	struct daos_prop_entry	*entry;
-	int			rc = 0;
+	struct daos_prop_entry *entry;
+	char                   *env;
+	int                     rc = 0;
 
 	rebuild_gst.rg_abort = 0;
 
+	d_agetenv_str(&env, REBUILD_ENV);
+	if (env && !strcasecmp(env, REBUILD_ENV_DISABLED)) {
+		D_DEBUG(DB_REBUILD, DF_UUID ": Rebuild is disabled for all pools\n",
+			DP_UUID(pool->sp_uuid));
+		d_freeenv_str(&env);
+		return DER_SUCCESS;
+	}
+	d_freeenv_str(&env);
+
 	if (pool->sp_reint_mode == DAOS_REINT_MODE_NO_DATA_SYNC) {
 		D_DEBUG(DB_REBUILD, DF_UUID" No data sync for reintegration\n",
 			DP_UUID(pool->sp_uuid));

From 3ab84e25e09699bfd99ec13d4927ddf84ed99453 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Thu, 16 Jan 2025 10:50:06 -0800
Subject: [PATCH 2/7] DAOS-16957 cq: Bump github/codeql-action from 3.28.0 to
 3.28.1 (#15722)

Bumps the gha-versions group with 1 update: [github/codeql-action](https://github.com/github/codeql-action).


Updates `github/codeql-action` from 3.28.0 to 3.28.1
- [Release notes](https://github.com/github/codeql-action/releases)
- [Changelog](https://github.com/github/codeql-action/blob/main/CHANGELOG.md)
- [Commits](https://github.com/github/codeql-action/compare/48ab28a6f5dbc2a99bf1e0131198dd8f1df78169...b6a472f63d85b9c78a3ac5e89422239fc15e9b3c)

---
updated-dependencies:
- dependency-name: github/codeql-action
  dependency-type: direct:production
  update-type: version-update:semver-patch
  dependency-group: gha-versions
...

Signed-off-by: dependabot[bot] <support@github.com>
---
 .github/workflows/ossf-scorecard.yml | 2 +-
 .github/workflows/trivy.yml          | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/ossf-scorecard.yml b/.github/workflows/ossf-scorecard.yml
index a51694de3eb..7525fb4658c 100644
--- a/.github/workflows/ossf-scorecard.yml
+++ b/.github/workflows/ossf-scorecard.yml
@@ -71,6 +71,6 @@ jobs:
       # Upload the results to GitHub's code scanning dashboard (optional).
       # Commenting out will disable upload of results to your repo's Code Scanning dashboard
       - name: "Upload to code-scanning"
-        uses: github/codeql-action/upload-sarif@48ab28a6f5dbc2a99bf1e0131198dd8f1df78169  # v3.28.0
+        uses: github/codeql-action/upload-sarif@b6a472f63d85b9c78a3ac5e89422239fc15e9b3c  # v3.28.1
         with:
           sarif_file: results.sarif
diff --git a/.github/workflows/trivy.yml b/.github/workflows/trivy.yml
index 031b768f141..6ede1e011a6 100644
--- a/.github/workflows/trivy.yml
+++ b/.github/workflows/trivy.yml
@@ -58,7 +58,7 @@ jobs:
           trivy-config: 'utils/trivy/trivy.yaml'
 
       - name: Upload Trivy scan results to GitHub Security tab
-        uses: github/codeql-action/upload-sarif@48ab28a6f5dbc2a99bf1e0131198dd8f1df78169  # v3.28.0
+        uses: github/codeql-action/upload-sarif@b6a472f63d85b9c78a3ac5e89422239fc15e9b3c  # v3.28.1
         with:
           sarif_file: 'trivy-results.sarif'
 

From 15842c69869f2bf76c1e631558e2231f1c2c96e1 Mon Sep 17 00:00:00 2001
From: wangdi <ddiwang@google.com>
Date: Thu, 16 Jan 2025 14:10:11 -0800
Subject: [PATCH 3/7] DAOS-16916 container: check inflight open (#15682)

Check for an inflight container open, which might be stuck in
IV fetch. In that case a following cont open would just increase
the open count, and if the previous container open then failed,
it would hit an assertion failure.

So let's retry if there is an inflight container open.

Signed-off-by: Di Wang <ddiwang@google.com>
---
 src/container/srv_target.c       | 29 ++++++++++++++++++++++++++---
 src/include/daos_srv/container.h | 20 ++++++--------------
 2 files changed, 32 insertions(+), 17 deletions(-)

diff --git a/src/container/srv_target.c b/src/container/srv_target.c
index 653cb5f7d37..80d6afb792c 100644
--- a/src/container/srv_target.c
+++ b/src/container/srv_target.c
@@ -1,5 +1,6 @@
 /**
  * (C) Copyright 2016-2024 Intel Corporation.
+ * (C) Copyright 2025 Google LLC
  *
  * SPDX-License-Identifier: BSD-2-Clause-Patent
  */
@@ -1607,11 +1608,23 @@ ds_cont_local_open(uuid_t pool_uuid, uuid_t cont_hdl_uuid, uuid_t cont_uuid,
 		 */
 		D_ASSERT(hdl->sch_cont != NULL);
 		D_ASSERT(hdl->sch_cont->sc_pool != NULL);
+
 		hdl->sch_cont->sc_open++;
+		if (hdl->sch_cont->sc_open > 1) {
+			/* If there is an inflight open being stuck, then
+			 * let's retry and wait until it finished.
+			 */
+			if (hdl->sch_cont->sc_open_initializing) {
+				hdl->sch_cont->sc_open--;
+				D_GOTO(err_cont, rc = -DER_AGAIN);
+			}
 
-		if (hdl->sch_cont->sc_open > 1)
-			goto opened;
+			/* Only go through if the 1st open succeeds */
+			if (hdl->sch_cont->sc_props_fetched)
+				goto opened;
+		}
 
+		hdl->sch_cont->sc_open_initializing = 1;
 		if (ds_pool_restricted(hdl->sch_cont->sc_pool->spc_pool, false))
 			goto csum_init;
 
@@ -1646,6 +1659,8 @@ ds_cont_local_open(uuid_t pool_uuid, uuid_t cont_hdl_uuid, uuid_t cont_uuid,
 		rc = ds_cont_csummer_init(hdl->sch_cont);
 		if (rc != 0)
 			D_GOTO(err_dtx, rc);
+
+		hdl->sch_cont->sc_open_initializing = 0;
 	}
 opened:
 	if (cont_hdl != NULL) {
@@ -1663,6 +1678,7 @@ ds_cont_local_open(uuid_t pool_uuid, uuid_t cont_hdl_uuid, uuid_t cont_uuid,
 	dtx_cont_close(hdl->sch_cont, true);
 
 err_cont:
+	hdl->sch_cont->sc_open_initializing = 0;
 	if (daos_handle_is_valid(poh)) {
 		int rc_tmp;
 
@@ -1750,9 +1766,15 @@ ds_cont_tgt_open(uuid_t pool_uuid, uuid_t cont_hdl_uuid,
 	D_DEBUG(DB_TRACE, "open pool/cont/hdl "DF_UUID"/"DF_UUID"/"DF_UUID"\n",
 		DP_UUID(pool_uuid), DP_UUID(cont_uuid), DP_UUID(cont_hdl_uuid));
 
+retry:
 	rc = ds_pool_thread_collective(pool_uuid, PO_COMP_ST_NEW | PO_COMP_ST_DOWN |
 				       PO_COMP_ST_DOWNOUT, cont_open_one, &arg, 0);
-	if (rc != 0)
+	if (rc != 0) {
+		if (rc == -DER_AGAIN) {
+			dss_sleep(50);
+			goto retry;
+		}
+
 		/* Once it exclude the target from the pool, since the target
 		 * might still in the cart group, so IV cont open might still
 		 * come to this target, especially if cont open/close will be
@@ -1762,6 +1784,7 @@ ds_cont_tgt_open(uuid_t pool_uuid, uuid_t cont_hdl_uuid,
 		D_ERROR("open "DF_UUID"/"DF_UUID"/"DF_UUID":"DF_RC"\n",
 			DP_UUID(pool_uuid), DP_UUID(cont_uuid),
 			DP_UUID(cont_hdl_uuid), DP_RC(rc));
+	}
 
 	return rc;
 }
diff --git a/src/include/daos_srv/container.h b/src/include/daos_srv/container.h
index 9fc615c2a8b..6564a4c0150 100644
--- a/src/include/daos_srv/container.h
+++ b/src/include/daos_srv/container.h
@@ -1,5 +1,6 @@
 /*
  * (C) Copyright 2015-2024 Intel Corporation.
+ * (C) Copyright 2025 Google LLC
  *
  * SPDX-License-Identifier: BSD-2-Clause-Patent
  */
@@ -66,20 +67,11 @@ struct ds_cont_child {
 	ABT_cond		 sc_scrub_cond;
 	ABT_cond		 sc_rebuild_cond;
 	ABT_cond		 sc_fini_cond;
-	uint32_t		 sc_dtx_resyncing:1,
-				 sc_dtx_reindex:1,
-				 sc_dtx_reindex_abort:1,
-				 sc_dtx_delay_reset:1,
-				 sc_dtx_registered:1,
-				 sc_props_fetched:1,
-				 sc_stopping:1,
-				 sc_destroying:1,
-				 sc_vos_agg_active:1,
-				 sc_ec_agg_active:1,
-				 /* flag of CONT_CAPA_READ_DATA/_WRITE_DATA disabled */
-				 sc_rw_disabled:1,
-				 sc_scrubbing:1,
-				 sc_rebuilding:1;
+	uint32_t                 sc_dtx_resyncing : 1, sc_dtx_reindex : 1, sc_dtx_reindex_abort : 1,
+	    sc_dtx_delay_reset : 1, sc_dtx_registered : 1, sc_props_fetched : 1, sc_stopping : 1,
+	    sc_destroying : 1, sc_vos_agg_active : 1, sc_ec_agg_active : 1,
+	    /* flag of CONT_CAPA_READ_DATA/_WRITE_DATA disabled */
+	    sc_rw_disabled : 1, sc_scrubbing : 1, sc_rebuilding : 1, sc_open_initializing : 1;
 	uint32_t		 sc_dtx_batched_gen;
 	/* Tracks the schedule request for aggregation ULT */
 	struct sched_request	*sc_agg_req;

From 4e0f12347caaa6a0f0df14e024b98afadf408293 Mon Sep 17 00:00:00 2001
From: Nasf-Fan <fan.yong@intel.com>
Date: Fri, 17 Jan 2025 10:23:55 +0800
Subject: [PATCH 4/7] DAOS-16809 vos: container based stable epoch (#15605)

* DAOS-16809 vos: container based stable epoch

To efficiently calculate the container based local stable epoch, we will
maintain some kind of sorted list of active DTX entries in epoch order. But
considering the related overhead, it is not easy to maintain a strictly
sorted list for all active DTX entries. For a DTX whose leader resides on the
current target, its epoch is already sorted when it is generated on the
current engine. So the main difficulty is with those DTX entries whose
leaders are on remote targets.

On the other hand, the local stable epoch is mainly used to generate the
global stable epoch, which is used for incremental reintegration. In fact, we
do not need a very accurate global stable epoch for incremental
reintegration. It does not matter (it is non-fatal) if the calculated stable
epoch is a bit smaller than the real one. For example, an error of a few
seconds in the stable epoch can almost be ignored when compared with the
overhead of rebuilding the whole target from scratch. So for a DTX entry
whose leader is on a remote target, we will maintain it in the list with a
relatively incremental trend based on the epoch instead of strictly sorting
by epoch. We introduce an O(1) algorithm to handle such an unsorted list of
DTX entries when calculating the local stable epoch.

Main VOS APIs for the stable epoch:

/* Calculate current locally known stable epoch for the given container. */
daos_epoch_t vos_cont_get_local_stable_epoch(daos_handle_t coh);

/* Get global stable epoch for the given container. */
daos_epoch_t vos_cont_get_global_stable_epoch(daos_handle_t coh);

/* Set global stable epoch for the given container. */
int vos_cont_set_global_stable_epoch(daos_handle_t coh, daos_epoch_t epoch);

Another important enhancement in the patch is about handling potential
conflict between EC/VOS aggregation and delayed modification with very
old epoch.

For a standalone transaction, when it is started on the DTX leader, its epoch
is generated by the leader, then the modification RPC is forwarded to the
other related non-leader(s). If the forwarded RPC is delayed for some reason,
such as network congestion or a busy system on the non-leader, the epoch of
the transaction may become very old (exceeding the related threshold), so
that VOS aggregation may have already aggregated the related epoch range. In
such a case, the non-leader will reject the modification to avoid data
loss/corruption.

For distributed transaction, if there is no read (fetch, query, enumerate,
and so on) before client commit_tx, then related DTX leader will generate
epoch for the transaction after client commit_tx. Then it will be the same
as above standalone transaction for epoch handling.

If the distributed transaction involves some read before client commit_tx,
its epoch will be generated by the first engine accessed for read. If the
transaction takes too long after that, then by the time of client commit_tx
its epoch may have become so old that the related DTX leader has to reject
the transaction to avoid the above mentioned conflict. And even if the DTX
leader does not reject the transaction, some non-leader may still reject it
because of the very old epoch. So under such a framework, the lifetime of a
distributed transaction cannot be too long. The allowed gap can be adjusted
via the server side environment variable DAOS_VOS_AGG_GAP. The default value
is 60 seconds.

NOTE: EC/VOS aggregation should avoid aggregating in the epoch range where
      lots of data records are pending to commit, so the aggregation epoch
      upper bound is 'current HLC - vos_agg_gap'.


Signed-off-by: Fan Yong <fan.yong@hpe.com>
---
 src/container/srv_target.c                  |   9 +-
 src/dtx/dtx_common.c                        |   2 +
 src/dtx/tests/dts_structs.c                 |   4 +-
 src/engine/sched.c                          |  26 +--
 src/include/daos/dtx.h                      |  12 +-
 src/include/daos_srv/dtx_srv.h              |   7 +-
 src/include/daos_srv/vos.h                  |  51 +++++
 src/include/daos_srv/vos_types.h            |   3 +
 src/object/srv_obj.c                        |  14 +-
 src/tests/ftest/util/server_utils_params.py |   2 +
 src/utils/ddb/tests/ddb_test_driver.c       |   7 +-
 src/vos/tests/vts_dtx.c                     |   6 +-
 src/vos/tests/vts_io.c                      |  21 +-
 src/vos/tests/vts_mvcc.c                    |   3 +-
 src/vos/tests/vts_pm.c                      |  76 +++-----
 src/vos/vos_common.c                        |  46 +++++
 src/vos/vos_container.c                     | 204 ++++++++++++++++++++
 src/vos/vos_dtx.c                           | 133 ++++++++++++-
 src/vos/vos_internal.h                      |  33 ++++
 src/vos/vos_layout.h                        |   8 +-
 20 files changed, 573 insertions(+), 94 deletions(-)

diff --git a/src/container/srv_target.c b/src/container/srv_target.c
index 80d6afb792c..e075b311ba8 100644
--- a/src/container/srv_target.c
+++ b/src/container/srv_target.c
@@ -1,6 +1,7 @@
 /**
  * (C) Copyright 2016-2024 Intel Corporation.
  * (C) Copyright 2025 Google LLC
+ * (C) Copyright 2025 Hewlett Packard Enterprise Development LP
  *
  * SPDX-License-Identifier: BSD-2-Clause-Patent
  */
@@ -322,7 +323,7 @@ cont_child_aggregate(struct ds_cont_child *cont, cont_aggregate_cb_t agg_cb,
 		     DAOS_FAIL_CHECK(DAOS_FORCE_EC_AGG_PEER_FAIL)))
 		interval = 0;
 	else
-		interval = d_sec2hlc(DAOS_AGG_THRESHOLD);
+		interval = d_sec2hlc(vos_get_agg_gap());
 
 	D_ASSERT(hlc > (interval * 2));
 	/*
@@ -410,6 +411,9 @@ cont_child_aggregate(struct ds_cont_child *cont, cont_aggregate_cb_t agg_cb,
 			DP_CONT(cont->sc_pool->spc_uuid, cont->sc_uuid),
 			tgt_id, epoch_range.epr_lo, epoch_range.epr_hi);
 
+		if (!param->ap_vos_agg)
+			vos_cont_set_mod_bound(cont->sc_hdl, epoch_range.epr_hi);
+
 		flags |= VOS_AGG_FL_FORCE_MERGE;
 		rc = agg_cb(cont, &epoch_range, flags, param);
 		if (rc)
@@ -426,6 +430,9 @@ cont_child_aggregate(struct ds_cont_child *cont, cont_aggregate_cb_t agg_cb,
 		DP_CONT(cont->sc_pool->spc_uuid, cont->sc_uuid),
 		tgt_id, epoch_range.epr_lo, epoch_range.epr_hi);
 
+	if (!param->ap_vos_agg)
+		vos_cont_set_mod_bound(cont->sc_hdl, epoch_range.epr_hi);
+
 	if (dss_xstream_is_busy())
 		flags &= ~VOS_AGG_FL_FORCE_MERGE;
 	rc = agg_cb(cont, &epoch_range, flags, param);
diff --git a/src/dtx/dtx_common.c b/src/dtx/dtx_common.c
index ad6bac38432..8544579a537 100644
--- a/src/dtx/dtx_common.c
+++ b/src/dtx/dtx_common.c
@@ -1,5 +1,6 @@
 /**
  * (C) Copyright 2019-2024 Intel Corporation.
+ * (C) Copyright 2025 Hewlett Packard Enterprise Development LP
  *
  * SPDX-License-Identifier: BSD-2-Clause-Patent
  */
@@ -922,6 +923,7 @@ dtx_handle_init(struct dtx_id *dti, daos_handle_t xoh, struct dtx_epoch *epoch,
 	dth->dth_for_migration = (flags & DTX_FOR_MIGRATION) ? 1 : 0;
 	dth->dth_ignore_uncommitted = (flags & DTX_IGNORE_UNCOMMITTED) ? 1 : 0;
 	dth->dth_prepared = (flags & DTX_PREPARED) ? 1 : 0;
+	dth->dth_epoch_owner = (flags & DTX_EPOCH_OWNER) ? 1 : 0;
 	dth->dth_aborted = 0;
 	dth->dth_already = 0;
 	dth->dth_need_validation = 0;
diff --git a/src/dtx/tests/dts_structs.c b/src/dtx/tests/dts_structs.c
index dc4347fed7c..f73eaad6e2f 100644
--- a/src/dtx/tests/dts_structs.c
+++ b/src/dtx/tests/dts_structs.c
@@ -1,5 +1,6 @@
 /**
  * (C) Copyright 2024 Intel Corporation.
+ * (C) Copyright 2025 Hewlett Packard Enterprise Development LP
  *
  * SPDX-License-Identifier: BSD-2-Clause-Patent
  */
@@ -70,8 +71,9 @@ struct_dtx_handle(void **state)
 	SET_BITFIELD_1(dummy, dth_need_validation);
 	SET_BITFIELD_1(dummy, dth_ignore_uncommitted);
 	SET_BITFIELD_1(dummy, dth_local);
+	SET_BITFIELD_1(dummy, dth_epoch_owner);
 	SET_BITFIELD_1(dummy, dth_local_complete);
-	SET_BITFIELD(dummy, padding1, 13);
+	SET_BITFIELD(dummy, padding1, 12);
 
 	SET_FIELD(dummy, dth_dti_cos_count);
 	SET_FIELD(dummy, dth_dti_cos);
diff --git a/src/engine/sched.c b/src/engine/sched.c
index e030bc8f74b..c9c34e09150 100644
--- a/src/engine/sched.c
+++ b/src/engine/sched.c
@@ -1,5 +1,6 @@
 /**
  * (C) Copyright 2016-2024 Intel Corporation.
+ * (C) Copyright 2025 Hewlett Packard Enterprise Development LP
  *
  * SPDX-License-Identifier: BSD-2-Clause-Patent
  */
@@ -197,17 +198,6 @@ enum {
 
 static int	sched_policy;
 
-/*
- * Time threshold for giving IO up throttling. If space pressure stays in the
- * highest level for enough long time, we assume that no more space can be
- * reclaimed and choose to give up IO throttling, so that ENOSPACE error could
- * be returned to client earlier.
- *
- * To make time for aggregation reclaiming overwriteen space, this threshold
- * should be longer than the DAOS_AGG_THRESHOLD.
- */
-#define SCHED_DELAY_THRESH	40000	/* msecs */
-
 struct pressure_ratio {
 	unsigned int	pr_free;	/* free space ratio */
 	unsigned int	pr_gc_ratio;	/* CPU percentage for GC & Aggregation */
@@ -943,12 +933,22 @@ is_gc_pending(struct sched_pool_info *spi)
 	return spi->spi_gc_ults && (spi->spi_gc_ults > spi->spi_gc_sleeping);
 }
 
-/* Just run into this space pressure situation recently? */
+/*
+ * Just run into this space pressure situation recently?
+ *
+ * If space pressure stays in the highest level for enough long time, we assume
+ * that no more space can be reclaimed and choose to give up IO throttling, so
+ * that ENOSPACE error could be returned to client earlier.
+ *
+ * To make time for aggregation reclaiming overwritten space, this threshold
+ * should be longer than VOS aggregation epoch gap against current HLC.
+ */
 static inline bool
 is_pressure_recent(struct sched_info *info, struct sched_pool_info *spi)
 {
 	D_ASSERT(info->si_cur_ts >= spi->spi_pressure_ts);
-	return (info->si_cur_ts - spi->spi_pressure_ts) < SCHED_DELAY_THRESH;
+	return (info->si_cur_ts - spi->spi_pressure_ts) <
+	       (vos_get_agg_gap() + 10) * 1000; /* msecs */
 }
 
 static inline uint64_t
diff --git a/src/include/daos/dtx.h b/src/include/daos/dtx.h
index ca719077a14..8d28fc5f5f9 100644
--- a/src/include/daos/dtx.h
+++ b/src/include/daos/dtx.h
@@ -1,5 +1,6 @@
 /**
  * (C) Copyright 2019-2023 Intel Corporation.
+ * (C) Copyright 2025 Hewlett Packard Enterprise Development LP
  *
  * SPDX-License-Identifier: BSD-2-Clause-Patent
  */
@@ -27,17 +28,6 @@
 /* The time (in second) threshold for batched DTX commit. */
 #define DTX_COMMIT_THRESHOLD_AGE	10
 
-/*
- * VOS aggregation should try to avoid aggregating in the epoch range where
- * lots of data records are pending to commit, so the aggregation epoch upper
- * bound is: current HLC - (DTX batched commit threshold + buffer period)
- *
- * To avoid conflicting of aggregation vs. transactions, any transactional
- * update/fetch with epoch lower than the aggregation upper bound should be
- * rejected and restarted.
- */
-#define DAOS_AGG_THRESHOLD	(DTX_COMMIT_THRESHOLD_AGE + 10) /* seconds */
-
 enum dtx_target_flags {
 	/* The target only contains read-only operations for the DTX. */
 	DTF_RDONLY			= (1 << 0),
diff --git a/src/include/daos_srv/dtx_srv.h b/src/include/daos_srv/dtx_srv.h
index 7c60d2deaa0..34c1a5d8c89 100644
--- a/src/include/daos_srv/dtx_srv.h
+++ b/src/include/daos_srv/dtx_srv.h
@@ -1,5 +1,6 @@
 /**
  * (C) Copyright 2019-2024 Intel Corporation.
+ * (C) Copyright 2025 Hewlett Packard Enterprise Development LP
  *
  * SPDX-License-Identifier: BSD-2-Clause-Patent
  */
@@ -113,8 +114,10 @@ struct dtx_handle {
 	    dth_ignore_uncommitted                : 1,
 	    /* Local transaction */
 	    dth_local                             : 1,
+	    /* Locally generate the epoch. */
+	    dth_epoch_owner			  : 1,
 	    /* Flag to commit the local transaction */
-	    dth_local_complete : 1, padding1 : 13;
+	    dth_local_complete : 1, padding1 : 12;
 
 	/* The count the DTXs in the dth_dti_cos array. */
 	uint32_t			 dth_dti_cos_count;
@@ -287,6 +290,8 @@ enum dtx_flags {
 	DTX_RELAY = (1 << 10),
 	/** Local transaction */
 	DTX_LOCAL = (1 << 11),
+	/** Locally generate the epoch. */
+	DTX_EPOCH_OWNER = (1 << 12),
 };
 
 void
diff --git a/src/include/daos_srv/vos.h b/src/include/daos_srv/vos.h
index 4aeabd76947..536efc778ef 100644
--- a/src/include/daos_srv/vos.h
+++ b/src/include/daos_srv/vos.h
@@ -1,5 +1,6 @@
 /**
  * (C) Copyright 2015-2024 Intel Corporation.
+ * (C) Copyright 2025 Hewlett Packard Enterprise Development LP
  *
  * SPDX-License-Identifier: BSD-2-Clause-Patent
  */
@@ -939,6 +940,56 @@ vos_update_renew_epoch(daos_handle_t ioh, struct dtx_handle *dth);
 void
 vos_dtx_renew_epoch(struct dtx_handle *dth);
 
+/**
+ * Calculate current locally known stable epoch for the given container.
+ *
+ * \param coh	[IN]	Container open handle
+ *
+ * \return		The epoch on success, negative value if error.
+ */
+daos_epoch_t
+vos_cont_get_local_stable_epoch(daos_handle_t coh);
+
+/**
+ * Get global stable epoch for the given container.
+ *
+ * \param coh	[IN]	Container open handle
+ *
+ * \return		The epoch on success, negative value if error.
+ */
+daos_epoch_t
+vos_cont_get_global_stable_epoch(daos_handle_t coh);
+
+/**
+ * Set global stable epoch for the given container.
+ *
+ * \param coh	[IN]	Container open handle
+ * \param epoch	[IN]	The epoch to be used as the new global stable epoch.
+ *
+ * \return		Zero on success, negative value if error.
+ */
+int
+vos_cont_set_global_stable_epoch(daos_handle_t coh, daos_epoch_t epoch);
+
+/**
+ * Set the lowest allowed modification epoch for the given container.
+ *
+ * \param coh	[IN]	Container open handle
+ * \param epoch	[IN]	The lowest allowed epoch for modification.
+ *
+ * \return		Zero on success, negative value if error.
+ */
+int
+vos_cont_set_mod_bound(daos_handle_t coh, uint64_t epoch);
+
+/**
+ * Query the gap between the max allowed aggregation epoch and current HLC.
+ *
+ * \return		The gap value in seconds.
+ */
+uint32_t
+vos_get_agg_gap(void);
+
 /**
  * Get the recx/epoch list.
  *
diff --git a/src/include/daos_srv/vos_types.h b/src/include/daos_srv/vos_types.h
index a0178ca52db..4b9273f7038 100644
--- a/src/include/daos_srv/vos_types.h
+++ b/src/include/daos_srv/vos_types.h
@@ -1,5 +1,6 @@
 /**
  * (C) Copyright 2015-2024 Intel Corporation.
+ * (C) Copyright 2025 Hewlett Packard Enterprise Development LP
  *
  * SPDX-License-Identifier: BSD-2-Clause-Patent
  */
@@ -58,6 +59,8 @@ enum dtx_entry_flags {
 	 * on all yet, need to be re-committed.
 	 */
 	DTE_PARTIAL_COMMITTED	= (1 << 5),
+	/* The DTX epoch is sorted locally. */
+	DTE_EPOCH_SORTED	= (1 << 6),
 };
 
 struct dtx_entry {
diff --git a/src/object/srv_obj.c b/src/object/srv_obj.c
index 6229b8ac51f..6b05b95d329 100644
--- a/src/object/srv_obj.c
+++ b/src/object/srv_obj.c
@@ -1,5 +1,6 @@
 /**
  * (C) Copyright 2016-2024 Intel Corporation.
+ * (C) Copyright 2025 Hewlett Packard Enterprise Development LP
  *
  * SPDX-License-Identifier: BSD-2-Clause-Patent
  */
@@ -2898,8 +2899,10 @@ ds_obj_rw_handler(crt_rpc_t *rpc)
 
 	rc = process_epoch(&orw->orw_epoch, &orw->orw_epoch_first,
 			   &orw->orw_flags);
-	if (rc == PE_OK_LOCAL)
+	if (rc == PE_OK_LOCAL) {
 		orw->orw_flags &= ~ORF_EPOCH_UNCERTAIN;
+		dtx_flags |= DTX_EPOCH_OWNER;
+	}
 
 	if (obj_rpc_is_fetch(rpc)) {
 		struct dtx_handle	*dth;
@@ -3858,8 +3861,10 @@ ds_obj_punch_handler(crt_rpc_t *rpc)
 
 	rc = process_epoch(&opi->opi_epoch, NULL /* epoch_first */,
 			   &opi->opi_flags);
-	if (rc == PE_OK_LOCAL)
+	if (rc == PE_OK_LOCAL) {
 		opi->opi_flags &= ~ORF_EPOCH_UNCERTAIN;
+		dtx_flags |= DTX_EPOCH_OWNER;
+	}
 
 	version = opi->opi_map_ver;
 	max_ver = opi->opi_map_ver;
@@ -5112,6 +5117,7 @@ ds_obj_dtx_leader(struct daos_cpd_args *dca)
 			   &dcsh->dcsh_epoch.oe_first,
 			   &dcsh->dcsh_epoch.oe_rpc_flags);
 	if (rc == PE_OK_LOCAL) {
+		dtx_flags |= DTX_EPOCH_OWNER;
 		/*
 		 * In this case, writes to local RDGs can use the chosen epoch
 		 * without any uncertainty. This optimization is left to future
@@ -5703,8 +5709,10 @@ ds_obj_coll_punch_handler(crt_rpc_t *rpc)
 
 	if (ocpi->ocpi_flags & ORF_LEADER) {
 		rc = process_epoch(&ocpi->ocpi_epoch, NULL /* epoch_first */, &ocpi->ocpi_flags);
-		if (rc == PE_OK_LOCAL)
+		if (rc == PE_OK_LOCAL) {
 			ocpi->ocpi_flags &= ~ORF_EPOCH_UNCERTAIN;
+			dtx_flags |= DTX_EPOCH_OWNER;
+		}
 	} else if (dct_nr == 1) {
 		rc = obj_coll_local(rpc, dcts[0].dct_shards, dce, &version, &ioc, NULL,
 				    odm->odm_mbs, obj_coll_tgt_punch);
diff --git a/src/tests/ftest/util/server_utils_params.py b/src/tests/ftest/util/server_utils_params.py
index 19dd8ea4df3..ef870eed670 100644
--- a/src/tests/ftest/util/server_utils_params.py
+++ b/src/tests/ftest/util/server_utils_params.py
@@ -1,5 +1,6 @@
 """
   (C) Copyright 2020-2024 Intel Corporation.
+  (C) Copyright 2025 Hewlett Packard Enterprise Development LP
 
   SPDX-License-Identifier: BSD-2-Clause-Patent
 """
@@ -441,6 +442,7 @@ class EngineYamlParameters(YamlParameters):
             "D_LOG_FILE_APPEND_PID=1",
             "DAOS_POOL_RF=4",
             "CRT_EVENT_DELAY=1",
+            "DAOS_VOS_AGG_GAP=25",
             "COVFILE=/tmp/test.cov"],
         "ofi+tcp": [],
         "ofi+tcp;ofi_rxm": [],
diff --git a/src/utils/ddb/tests/ddb_test_driver.c b/src/utils/ddb/tests/ddb_test_driver.c
index c08cb821c65..07e0b0c8694 100644
--- a/src/utils/ddb/tests/ddb_test_driver.c
+++ b/src/utils/ddb/tests/ddb_test_driver.c
@@ -1,5 +1,6 @@
 /**
  * (C) Copyright 2022-2024 Intel Corporation.
+ * (C) Copyright 2025 Hewlett Packard Enterprise Development LP
  *
  * SPDX-License-Identifier: BSD-2-Clause-Patent
  */
@@ -410,6 +411,7 @@ dvt_dtx_begin_helper(daos_handle_t coh, const daos_unit_oid_t *oid, daos_epoch_t
 	struct dtx_handle	*dth;
 	struct dtx_memberships	*mbs;
 	size_t			 size;
+	int			 rc;
 
 	D_ALLOC_PTR(dth);
 	assert_non_null(dth);
@@ -449,7 +451,8 @@ dvt_dtx_begin_helper(daos_handle_t coh, const daos_unit_oid_t *oid, daos_epoch_t
 	dth->dth_shares_inited = 1;
 
 	vos_dtx_rsrvd_init(dth);
-	vos_dtx_attach(dth, false, false);
+	rc = vos_dtx_attach(dth, false, false);
+	assert_rc_equal(rc, 0);
 
 	*dthp = dth;
 }
@@ -478,7 +481,7 @@ dvt_vos_insert_dtx_records(daos_handle_t coh, uint32_t nr, uint32_t committed_nr
 	daos_recx_t		 recxs[recxs_nr];
 	daos_iod_t		 iod = {0};
 	d_sg_list_t		 sgl = {0};
-	daos_epoch_t		 epoch = 1;
+	daos_epoch_t		 epoch = d_hlc_get();
 	uint64_t		 dkey_hash = 0x123;
 	int			 i;
 
diff --git a/src/vos/tests/vts_dtx.c b/src/vos/tests/vts_dtx.c
index d83d2356d15..bbbde813113 100644
--- a/src/vos/tests/vts_dtx.c
+++ b/src/vos/tests/vts_dtx.c
@@ -1,5 +1,6 @@
 /**
  * (C) Copyright 2019-2024 Intel Corporation.
+ * (C) Copyright 2025 Hewlett Packard Enterprise Development LP
  *
  * SPDX-License-Identifier: BSD-2-Clause-Patent
  */
@@ -43,6 +44,7 @@ vts_dtx_begin(const daos_unit_oid_t *oid, daos_handle_t coh, daos_epoch_t epoch,
 	      uint64_t dkey_hash, struct dtx_handle **dthp)
 {
 	struct dtx_handle	*dth;
+	int			 rc;
 
 	D_ALLOC_PTR(dth);
 	assert_non_null(dth);
@@ -66,6 +68,7 @@ vts_dtx_begin(const daos_unit_oid_t *oid, daos_handle_t coh, daos_epoch_t epoch,
 	dth->dth_for_migration = 0;
 	dth->dth_ignore_uncommitted = 0;
 	dth->dth_prepared = 0;
+	dth->dth_epoch_owner = 0;
 	dth->dth_aborted = 0;
 	dth->dth_already = 0;
 	dth->dth_need_validation = 0;
@@ -91,7 +94,8 @@ vts_dtx_begin(const daos_unit_oid_t *oid, daos_handle_t coh, daos_epoch_t epoch,
 	dth->dth_shares_inited = 1;
 
 	vos_dtx_rsrvd_init(dth);
-	vos_dtx_attach(dth, false, false);
+	rc = vos_dtx_attach(dth, false, false);
+	assert_rc_equal(rc, 0);
 
 	*dthp = dth;
 }
diff --git a/src/vos/tests/vts_io.c b/src/vos/tests/vts_io.c
index 14653c55425..f95af2f0719 100644
--- a/src/vos/tests/vts_io.c
+++ b/src/vos/tests/vts_io.c
@@ -1,5 +1,6 @@
 /**
  * (C) Copyright 2016-2024 Intel Corporation.
+ * (C) Copyright 2025 Hewlett Packard Enterprise Development LP
  *
  * SPDX-License-Identifier: BSD-2-Clause-Patent
  */
@@ -175,7 +176,7 @@ test_args_init(struct io_test_args *args,
 	memset(args, 0, sizeof(*args));
 	memset(&vts_cntr, 0, sizeof(vts_cntr));
 
-	vts_epoch_gen = 1;
+	vts_epoch_gen = d_hlc_get();
 
 	rc = vts_ctx_init(&args->ctx, pool_size);
 	if (rc != 0)
@@ -1590,7 +1591,7 @@ vos_iterate_test(void **state)
 	struct all_info		info = {0};
 	vos_iter_param_t	param = {0};
 	struct vos_iter_anchors	anchors = {0};
-	daos_epoch_t		epoch = 1;
+	daos_epoch_t		epoch = d_hlc_get();
 	int			rc = 0;
 	unsigned long		old_flags = arg->ta_flags;
 
@@ -2038,7 +2039,7 @@ io_simple_one_key_cross_container(void **state)
 	d_sg_list_t		sgl;
 	daos_key_t		dkey;
 	daos_key_t		akey;
-	daos_epoch_t		epoch = gen_rand_epoch();
+	daos_epoch_t		epoch;
 	daos_unit_oid_t		l_oid;
 
 	/* Creating an additional container */
@@ -2087,6 +2088,7 @@ io_simple_one_key_cross_container(void **state)
 	iod.iod_type	= DAOS_IOD_ARRAY;
 
 	l_oid = gen_oid(arg->otype);
+	epoch = gen_rand_epoch();
 	rc  = vos_obj_update(arg->ctx.tc_co_hdl, arg->oid, epoch, 0, 0, &dkey,
 			     1, &iod, NULL, &sgl);
 	if (rc) {
@@ -2526,7 +2528,7 @@ oid_iter_test_with_anchor(void **state)
 #define KEY_INC		127
 #define MAX_INT_KEY	(NUM_KEYS * KEY_INC)
 
-static void gen_query_tree(struct io_test_args *arg, daos_unit_oid_t oid)
+static void gen_query_tree(struct io_test_args *arg, daos_unit_oid_t oid, daos_epoch_t epoch)
 {
 	daos_iod_t		iod = {0};
 	d_sg_list_t		sgl = {0};
@@ -2534,7 +2536,6 @@ static void gen_query_tree(struct io_test_args *arg, daos_unit_oid_t oid)
 	daos_key_t		akey;
 	d_iov_t			val_iov;
 	daos_recx_t		recx;
-	daos_epoch_t		epoch = 1;
 	uint64_t		dkey_value;
 	uint64_t		akey_value;
 	int			i, j;
@@ -2608,7 +2609,7 @@ io_query_key(void **state)
 	int			i, j;
 	struct dtx_handle	*dth;
 	struct dtx_id		xid;
-	daos_epoch_t		epoch = 1;
+	daos_epoch_t		epoch = d_hlc_get();
 	daos_key_t		dkey;
 	daos_key_t		akey;
 	daos_key_t		dkey_read;
@@ -2623,7 +2624,7 @@ io_query_key(void **state)
 
 	oid = gen_oid(arg->otype);
 
-	gen_query_tree(arg, oid);
+	gen_query_tree(arg, oid, epoch);
 
 	for (i = 1; i <= NUM_KEYS; i++) {
 		for (j = 1; j <= NUM_KEYS; j++) {
@@ -2873,7 +2874,7 @@ io_query_key_punch_update(void **state)
 {
 	struct io_test_args	*arg = *state;
 	int			rc = 0;
-	daos_epoch_t		epoch = 1;
+	daos_epoch_t		epoch = d_hlc_get();
 	daos_key_t		dkey = { 0 };
 	daos_key_t		akey;
 	daos_recx_t		recx_read;
@@ -2949,7 +2950,7 @@ io_query_key_negative(void **state)
 			       &recx_read, NULL, 0, 0, NULL);
 	assert_rc_equal(rc, -DER_NONEXIST);
 
-	gen_query_tree(arg, oid);
+	gen_query_tree(arg, oid, d_hlc_get());
 
 	rc = vos_obj_query_key(arg->ctx.tc_co_hdl, arg->oid,
 			       DAOS_GET_DKEY | DAOS_GET_MAX, 4,
@@ -3006,7 +3007,7 @@ gang_sv_test(void **state)
 	char			dkey_buf[UPDATE_DKEY_SIZE], akey_buf[UPDATE_AKEY_SIZE];
 	char			*update_buf, *fetch_buf;
 	daos_size_t		rsize = (27UL << 20);	/* 27MB */
-	daos_epoch_t		epoch = 1;
+	daos_epoch_t		epoch = d_hlc_get();
 	int			rc;
 
 	D_ALLOC(update_buf, rsize);
diff --git a/src/vos/tests/vts_mvcc.c b/src/vos/tests/vts_mvcc.c
index 907b6957cf1..f6f1e40dc5d 100644
--- a/src/vos/tests/vts_mvcc.c
+++ b/src/vos/tests/vts_mvcc.c
@@ -1,5 +1,6 @@
 /*
  * (C) Copyright 2020-2023 Intel Corporation.
+ * (C) Copyright 2025 Hewlett Packard Enterprise Development LP
  *
  * SPDX-License-Identifier: BSD-2-Clause-Patent
  */
@@ -1720,7 +1721,7 @@ setup_mvcc(void **state)
 	D_ASSERT(arg->custom == NULL);
 	D_ALLOC_PTR(mvcc_arg);
 	D_ASSERT(mvcc_arg != NULL);
-	mvcc_arg->epoch = 500;
+	mvcc_arg->epoch = d_hlc_get() + 500;
 	d_getenv_bool("CMOCKA_TEST_ABORT", &mvcc_arg->fail_fast);
 	arg->custom = mvcc_arg;
 	return 0;
diff --git a/src/vos/tests/vts_pm.c b/src/vos/tests/vts_pm.c
index 7df7b39da27..e22551d79b0 100644
--- a/src/vos/tests/vts_pm.c
+++ b/src/vos/tests/vts_pm.c
@@ -1,5 +1,6 @@
 /**
  * (C) Copyright 2019-2022 Intel Corporation.
+ * (C) Copyright 2025 Hewlett Packard Enterprise Development LP
  *
  * SPDX-License-Identifier: BSD-2-Clause-Patent
  */
@@ -25,7 +26,6 @@
 #endif
 
 
-static int start_epoch = 5;
 #define BUF_SIZE 2000
 static int buf_size = BUF_SIZE;
 struct pm_info {
@@ -1271,11 +1271,11 @@ cond_test(void **state)
 	daos_unit_oid_t		 oid;
 	d_sg_list_t		 sgl[MAX_SGL] = {0};
 	d_iov_t			 iov[MAX_SGL];
-	daos_epoch_t		 epoch = start_epoch;
+	daos_epoch_t		 epoch;
 	int			 i;
 
 	test_args_reset(arg, VPOOL_SIZE);
-
+	epoch = d_hlc_get() + 1000;
 	oid = gen_oid(0);
 
 	for (i = 0; i < MAX_SGL; i++) {
@@ -1369,8 +1369,6 @@ cond_test(void **state)
 			0, -DER_NO_PERM, sgl, 5, "new",
 			"foo", "f", "bar", "d", "val", "e", "flag", "new",
 			"temp");
-
-	start_epoch = epoch + 1;
 }
 
 /** Making the oid generation deterministic, I get to 18201 before I hit a false
@@ -1386,12 +1384,11 @@ multiple_oid_cond_test(void **state)
 	daos_unit_oid_t		 oid;
 	d_sg_list_t		 sgl = {0};
 	d_iov_t			 iov = {0};
-	daos_epoch_t		 epoch = start_epoch + NUM_OIDS * 3;
+	daos_epoch_t		 epoch;
 	int			 i;
 
-	start_epoch = epoch + 1;
-
 	test_args_reset(arg, VPOOL_SIZE);
+	epoch = d_hlc_get() + NUM_OIDS * 3;
 	sgl.sg_iovs = &iov;
 	sgl.sg_nr = 1;
 	sgl.sg_nr_out = 1;
@@ -1495,13 +1492,13 @@ remove_test(void **state)
 	d_sg_list_t		 sgl;
 	daos_recx_t		 recx[SM_BUF_LEN];
 	daos_unit_oid_t		 oid;
-	daos_epoch_t		 epoch = start_epoch;
+	daos_epoch_t		 epoch;
 	int			 rc = 0;
 	char			 key1 = 'a';
 	char			 key2 = 'b';
 
 	test_args_reset(arg, VPOOL_SIZE);
-
+	epoch = d_hlc_get();
 	oid = gen_oid(0);
 
 	d_iov_set(&dkey, &key1, sizeof(key1));
@@ -1594,8 +1591,6 @@ remove_test(void **state)
 		    FETCH_DATA, 1, &REM_VAL1[0], FETCH_HOLE,
 		    sizeof(REM_VAL1) + sizeof(REM_VAL2) + sizeof(REM_VAL3) - 5,
 		    FETCH_DATA, 1, &REM_VAL3[sizeof(REM_VAL3) - 2], FETCH_END);
-
-	start_epoch = epoch + 1;
 }
 
 static void
@@ -1670,7 +1665,7 @@ minor_epoch_punch_sv(void **state)
 	daos_recx_t		rex;
 	daos_iod_t		iod;
 	d_sg_list_t		sgl;
-	daos_epoch_t		epoch = start_epoch;
+	daos_epoch_t		epoch;
 	struct dtx_handle	*dth;
 	struct dtx_id		 xid;
 	const char		*expected = "xxxxx";
@@ -1681,7 +1676,7 @@ minor_epoch_punch_sv(void **state)
 	daos_unit_oid_t		oid;
 
 	test_args_reset(arg, VPOOL_SIZE);
-
+	epoch = d_hlc_get();
 	memset(&rex, 0, sizeof(rex));
 	memset(&iod, 0, sizeof(iod));
 
@@ -1741,7 +1736,6 @@ minor_epoch_punch_sv(void **state)
 	assert_memory_equal(buf, expected, strlen(expected));
 
 	d_sgl_fini(&sgl, false);
-	start_epoch = epoch + 1;
 }
 
 static void
@@ -1754,7 +1748,7 @@ minor_epoch_punch_array(void **state)
 	daos_recx_t		rex;
 	daos_iod_t		iod;
 	d_sg_list_t		sgl;
-	daos_epoch_t		epoch = start_epoch;
+	daos_epoch_t		epoch;
 	struct dtx_handle	*dth;
 	struct dtx_id		 xid;
 	const char		*expected = "xxxxxLonelyWorld";
@@ -1766,7 +1760,7 @@ minor_epoch_punch_array(void **state)
 	daos_unit_oid_t		oid;
 
 	test_args_reset(arg, VPOOL_SIZE);
-
+	epoch = d_hlc_get();
 	memset(&rex, 0, sizeof(rex));
 	memset(&iod, 0, sizeof(iod));
 
@@ -1837,7 +1831,6 @@ minor_epoch_punch_array(void **state)
 	assert_memory_equal(buf, expected, strlen(expected));
 
 	d_sgl_fini(&sgl, false);
-	start_epoch = epoch + 1;
 }
 
 static void
@@ -1850,7 +1843,7 @@ minor_epoch_punch_rebuild(void **state)
 	daos_recx_t		rex;
 	daos_iod_t		iod;
 	d_sg_list_t		sgl;
-	daos_epoch_t		epoch = start_epoch;
+	daos_epoch_t		epoch;
 	const char		*expected = "xxxxxlonelyworld";
 	const char		*first = "hello";
 	const char		*second = "lonelyworld";
@@ -1860,7 +1853,7 @@ minor_epoch_punch_rebuild(void **state)
 	daos_unit_oid_t		oid;
 
 	test_args_reset(arg, VPOOL_SIZE);
-
+	epoch = d_hlc_get();
 	memset(&rex, 0, sizeof(rex));
 	memset(&iod, 0, sizeof(iod));
 
@@ -1930,8 +1923,6 @@ minor_epoch_punch_rebuild(void **state)
 	epoch += 2;
 
 	d_sgl_fini(&sgl, false);
-
-	start_epoch = epoch + 1;
 }
 
 #define NUM_RANKS 100
@@ -1948,7 +1939,7 @@ many_keys(void **state)
 	daos_recx_t		rex;
 	daos_iod_t		iod;
 	d_sg_list_t		sgl;
-	daos_epoch_t		epoch = start_epoch;
+	daos_epoch_t		epoch = d_hlc_get();
 	const char		*w = "x";
 	char			*dkey_buf = DKEY_NAME;
 	char			akey_buf[UPDATE_DKEY_SIZE];
@@ -1995,8 +1986,6 @@ many_keys(void **state)
 	}
 
 	d_sgl_fini(&sgl, false);
-
-	start_epoch = epoch + 1;
 }
 
 #define CELL_SZ 2
@@ -2102,7 +2091,7 @@ ec_size(void **state)
 	struct io_test_args	*arg = *state;
 	int			rc = 0;
 	d_sg_list_t		sgl;
-	daos_epoch_t		epoch = start_epoch;
+	daos_epoch_t		epoch;
 	const char		w[] = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx";
 	daos_unit_oid_t		oid;
 	uint64_t		size;
@@ -2110,7 +2099,7 @@ ec_size(void **state)
 	int			i;
 
 	test_args_reset(arg, VPOOL_1G);
-
+	epoch = d_hlc_get();
 	rc = d_sgl_init(&sgl, 1);
 	assert_rc_equal(rc, 0);
 
@@ -2201,8 +2190,6 @@ ec_size(void **state)
 	assert_int_equal(size, 201 * STRIPE_SZ);
 
 	d_sgl_fini(&sgl, false);
-
-	start_epoch = epoch + 1;
 }
 
 static void
@@ -2219,7 +2206,7 @@ test_inprogress_parent_punch(void **state)
 	d_sg_list_t		sgl;
 	struct dtx_handle	*dth1;
 	struct dtx_handle	*dth2;
-	daos_epoch_t		epoch = start_epoch;
+	daos_epoch_t		epoch;
 	struct dtx_id		xid1;
 	struct dtx_id		xid2;
 	const char		*expected = "xxxxx";
@@ -2232,7 +2219,7 @@ test_inprogress_parent_punch(void **state)
 	daos_unit_oid_t		oid;
 
 	test_args_reset(arg, VPOOL_SIZE);
-
+	epoch = d_hlc_get();
 	memset(&rex, 0, sizeof(rex));
 	memset(&iod, 0, sizeof(iod));
 
@@ -2336,8 +2323,6 @@ test_inprogress_parent_punch(void **state)
 	assert_memory_equal(buf, expected, strlen(expected));
 
 	d_sgl_fini(&sgl, false);
-
-	start_epoch = epoch + 1;
 }
 
 #define NR_OBJ 10
@@ -2458,9 +2443,9 @@ many_tx(void **state)
 	d_sg_list_t		sgl;
 	d_sg_list_t		fetch_sgl;
 	char			buf[32];
-	daos_epoch_t		epoch = start_epoch;
+	daos_epoch_t		epoch;
 	daos_handle_t		coh;
-	daos_epoch_range_t	epr = {epoch, epoch};
+	daos_epoch_range_t	epr;
 	struct vos_ioreq	req[NR_TX] = {0};
 	const char		*first = "Hello";
 	char			dkey_buf[NR_DKEY][UPDATE_DKEY_SIZE];
@@ -2482,9 +2467,12 @@ many_tx(void **state)
 
 	test_args_reset(arg, VPOOL_SIZE);
 	coh = arg->ctx.tc_co_hdl;
-
 	memset(&iod, 0, sizeof(iod));
 
+	epoch = d_hlc_get() + 1000;
+	epr.epr_lo = epoch;
+	epr.epr_hi = epoch;
+
 	rc = d_sgl_init(&sgl, 1);
 	assert_rc_equal(rc, 0);
 	rc = d_sgl_init(&fetch_sgl, 1);
@@ -2623,7 +2611,6 @@ many_tx(void **state)
 
 	d_sgl_fini(&sgl, false);
 	d_sgl_fini(&fetch_sgl, false);
-	start_epoch = epoch + 1;
 }
 
 static struct dtx_id
@@ -2685,7 +2672,7 @@ uncommitted_parent(void **state)
 	daos_iod_t		iod;
 	d_sg_list_t		sgl;
 	char			buf[32];
-	daos_epoch_t		epoch = start_epoch;
+	daos_epoch_t		epoch;
 	daos_handle_t		coh;
 	char			*first = "Hello";
 	char			dkey_buf[UPDATE_DKEY_SIZE];
@@ -2695,7 +2682,7 @@ uncommitted_parent(void **state)
 
 	test_args_reset(arg, VPOOL_SIZE);
 	coh = arg->ctx.tc_co_hdl;
-
+	epoch = d_hlc_get();
 	memset(&iod, 0, sizeof(iod));
 
 	rc = d_sgl_init(&sgl, 1);
@@ -2735,7 +2722,6 @@ uncommitted_parent(void **state)
 	assert_memory_equal(buf, first, 5);
 
 	d_sgl_fini(&sgl, false);
-	start_epoch = epoch + 1;
 }
 
 static void
@@ -2749,7 +2735,7 @@ test_uncommitted_key(void **state)
 	daos_iod_t           iod;
 	d_sg_list_t          sgl;
 	char                 buf[32];
-	daos_epoch_t         epoch = start_epoch;
+	daos_epoch_t         epoch;
 	daos_handle_t        coh;
 	char                *first = "Hello";
 	char                 dkey_buf[UPDATE_DKEY_SIZE];
@@ -2759,7 +2745,7 @@ test_uncommitted_key(void **state)
 
 	test_args_reset(arg, VPOOL_SIZE);
 	coh = arg->ctx.tc_co_hdl;
-
+	epoch = d_hlc_get();
 	memset(&iod, 0, sizeof(iod));
 
 	rc = d_sgl_init(&sgl, 1);
@@ -2797,7 +2783,6 @@ test_uncommitted_key(void **state)
 	assert_memory_equal(buf, "Hello", 5);
 
 	d_sgl_fini(&sgl, false);
-	start_epoch = epoch + 1;
 }
 
 static void
@@ -2811,7 +2796,7 @@ test_multiple_key_conditionals_common(void **state, bool with_dtx)
 	daos_recx_t		rex[2] = {0};
 	daos_iod_t		iod[2] = {0};
 	d_sg_list_t		sgl[2] = {0};
-	daos_epoch_t		epoch = start_epoch;
+	daos_epoch_t		epoch;
 	struct dtx_handle	*dth = NULL;
 	struct dtx_id		 xid;
 	const char		*expected = "xxxxx";
@@ -2825,7 +2810,7 @@ test_multiple_key_conditionals_common(void **state, bool with_dtx)
 	daos_unit_oid_t		oid;
 
 	test_args_reset(arg, VPOOL_SIZE);
-
+	epoch = d_hlc_get();
 	memset(rex, 0, sizeof(rex));
 	memset(iod, 0, sizeof(iod));
 
@@ -3010,7 +2995,6 @@ test_multiple_key_conditionals_common(void **state, bool with_dtx)
 	if (with_dtx)
 		vts_dtx_end(dth);
 
-	start_epoch = epoch + 1;
 	d_sgl_fini(&sgl[0], false);
 	d_sgl_fini(&sgl[1], false);
 }
diff --git a/src/vos/vos_common.c b/src/vos/vos_common.c
index fe68aa40340..4713971d65c 100644
--- a/src/vos/vos_common.c
+++ b/src/vos/vos_common.c
@@ -1,5 +1,6 @@
 /**
  * (C) Copyright 2016-2024 Intel Corporation.
+ * (C) Copyright 2025 Hewlett Packard Enterprise Development LP
  *
  * SPDX-License-Identifier: BSD-2-Clause-Patent
  */
@@ -620,6 +621,42 @@ struct dss_module_key vos_module_key = {
 
 daos_epoch_t	vos_start_epoch = DAOS_EPOCH_MAX;
 
+/*
+ * For standalone transaction, when it is started on the DTX leader, its epoch
+ * is generated by the leader, then the modification RPC will be forwarded to
+ * other related non-leader(s). If the forwarded RPC is delayed for some reason,
+ * such as network congestion or system busy on the non-leader, then the epoch
+ * for such transaction may become very old (exceeding related threshold), and
+ * VOS aggregation may have already aggregated the related epoch range. In that
+ * case, the non-leader will reject the modification to avoid data loss/corruption.
+ *
+ * For distributed transaction, if there is no read (fetch, query, enumerate,
+ * and so on) before client commit_tx, then related DTX leader will generate
+ * epoch for the transaction after client commit_tx. Then it will be the same
+ * as above standalone transaction for epoch handling.
+ *
+ * If the distributed transaction involves some read before client commit_tx,
+ * its epoch will be generated by the first accessed engine for read. If the
+ * transaction takes too long after that, then when client calls commit_tx, its
+ * epoch may become so old that related DTX leader will have to reject the
+ * transaction to avoid above mentioned conflict. And even if the DTX leader
+ * did not reject the transaction, some non-leader may also reject it because
+ * of the very old epoch. So it means that under such framework, the life for
+ * a distributed transaction cannot be too long. That can be adjusted via the
+ * server side environment variable DAOS_VOS_AGG_GAP.
+ *
+ * NOTE: EC/VOS aggregation should avoid aggregating in the epoch range where
+ *	 lots of data records are pending to commit, so the aggregation epoch
+ *	 upper bound is 'current HLC - vos_agg_gap'.
+ */
+uint32_t	vos_agg_gap;
+
+uint32_t
+vos_get_agg_gap(void)
+{
+	return vos_agg_gap;
+}
+
 static int
 vos_mod_init(void)
 {
@@ -679,6 +716,15 @@ vos_mod_init(void)
 	d_getenv_bool("DAOS_DKEY_PUNCH_PROPAGATE", &vos_dkey_punch_propagate);
 	D_INFO("DKEY punch propagation is %s\n", vos_dkey_punch_propagate ? "enabled" : "disabled");
 
+	vos_agg_gap = VOS_AGG_GAP_DEF;
+	d_getenv_uint("DAOS_VOS_AGG_GAP", &vos_agg_gap);
+	if (vos_agg_gap < VOS_AGG_GAP_MIN || vos_agg_gap > VOS_AGG_GAP_MAX) {
+		D_WARN("Invalid DAOS_VOS_AGG_GAP value, "
+		       "valid range [%u, %u], set it as default %u (second)\n",
+		       VOS_AGG_GAP_MIN, VOS_AGG_GAP_MAX, VOS_AGG_GAP_DEF);
+		vos_agg_gap = VOS_AGG_GAP_DEF;
+	}
+	D_INFO("Set DAOS VOS aggregation gap as %u (second)\n", vos_agg_gap);
 
 	return rc;
 }
diff --git a/src/vos/vos_container.c b/src/vos/vos_container.c
index 8f1069878c1..b6294ad3031 100644
--- a/src/vos/vos_container.c
+++ b/src/vos/vos_container.c
@@ -199,6 +199,9 @@ cont_free_internal(struct vos_container *cont)
 		lrua_array_free(cont->vc_dtx_array);
 
 	D_ASSERT(d_list_empty(&cont->vc_dtx_act_list));
+	D_ASSERT(d_list_empty(&cont->vc_dtx_sorted_list));
+	D_ASSERT(d_list_empty(&cont->vc_dtx_unsorted_list));
+	D_ASSERT(d_list_empty(&cont->vc_dtx_reindex_list));
 
 	dbtree_close(cont->vc_btr_hdl);
 
@@ -394,6 +397,9 @@ vos_cont_open(daos_handle_t poh, uuid_t co_uuid, daos_handle_t *coh)
 		cont->vc_cmt_dtx_indexed = 0;
 	cont->vc_cmt_dtx_reindex_pos = cont->vc_cont_df->cd_dtx_committed_head;
 	D_INIT_LIST_HEAD(&cont->vc_dtx_act_list);
+	D_INIT_LIST_HEAD(&cont->vc_dtx_sorted_list);
+	D_INIT_LIST_HEAD(&cont->vc_dtx_unsorted_list);
+	D_INIT_LIST_HEAD(&cont->vc_dtx_reindex_list);
 	cont->vc_dtx_committed_count = 0;
 	cont->vc_solo_dtx_epoch = d_hlc_get();
 	rc = gc_open_cont(cont);
@@ -460,6 +466,21 @@ vos_cont_open(daos_handle_t poh, uuid_t co_uuid, daos_handle_t *coh)
 		}
 	}
 
+	/*
+	 * Assign vc_mod_epoch_bound with current HLC, then all former reported local stable
+	 * epoch (not persistently stored) before re-opening the container will be older
+	 * than vc_mod_epoch_bound. It is possible that some modification was started before
+	 * current container reopen (such as for engine restart without related pool service
+	 * down), but related RPC was not forwarded to current engine in time. After current
+	 * engine re-opening the container (shard), it will reject such old modification and
+	 * ask related DTX leader to restart the transaction. It may only affect inflight IO
+	 * during re-opening container without restarting pool service.
+	 *
+	 * With the assignment, we also do not need to consider former EC/VOS aggregation
+	 * upper boundary when reopening the container.
+	 */
+	cont->vc_mod_epoch_bound = d_hlc_get();
+
 	rc = vos_dtx_act_reindex(cont);
 	if (rc != 0) {
 		D_ERROR("Fail to reindex active DTX entries: %d\n", rc);
@@ -814,3 +835,186 @@ struct vos_iter_ops vos_cont_iter_ops = {
 	.iop_fetch   = cont_iter_fetch,
 	.iop_process  = cont_iter_process,
 };
+
+/*
+ * The local stable epoch can be used to calculate global stable epoch: all the container
+ * shards report their own local stable epochs to some leader who will find out the smallest
+ * one as the global stable epoch and dispatch it to all related container shards.
+ */
+daos_epoch_t
+vos_cont_get_local_stable_epoch(daos_handle_t coh)
+{
+	struct vos_container	*cont;
+	struct vos_dtx_act_ent	*dae;
+	uint64_t		 gap = d_sec2hlc(vos_agg_gap);
+	daos_epoch_t		 epoch = d_hlc_get() - gap;
+
+	cont = vos_hdl2cont(coh);
+	D_ASSERT(cont != NULL);
+
+	/*
+	 * If the oldest (that is at the head of the sorted list) sorted DTX's
+	 * epoch is out of the boundary, then use it as the local stable epoch.
+	 */
+	if (!d_list_empty(&cont->vc_dtx_sorted_list)) {
+		dae = d_list_entry(cont->vc_dtx_sorted_list.next,
+				   struct vos_dtx_act_ent, dae_order_link);
+		if (epoch >= DAE_EPOCH(dae))
+			epoch = DAE_EPOCH(dae) - 1;
+	}
+
+	/*
+	 * It is not easy to know which DTX is the oldest one in the unsorted list.
+	 * The one after the header in the list may be older than the header. But the
+	 * epoch difference will NOT exceed 'vos_agg_gap' since any DTX with older
+	 * epoch will be rejected (and restart with newer epoch).
+	 *
+	 * So "DAE_EPOCH(header) - vos_agg_gap" can be used to estimate the local
+	 * stable epoch for unsorted DTX entries.
+	 */
+	if (!d_list_empty(&cont->vc_dtx_unsorted_list)) {
+		dae = d_list_entry(cont->vc_dtx_unsorted_list.next,
+				   struct vos_dtx_act_ent, dae_order_link);
+		if (epoch > DAE_EPOCH(dae) - gap)
+			epoch = DAE_EPOCH(dae) - gap;
+	}
+
+	/*
+	 * The historical vos_agg_gap for the DTX entries in the reindex list is unknown.
+	 * We use cont->vc_dtx_reindex_eph_diff to estimate the local stable epoch. That
+	 * may be over-estimated. Usually, the count of re-indexed DTX entries is quite
+	 * limited, and will be purged soon after the container opened (via DTX resync).
+	 * So it will not much affect the local stable epoch calculation.
+	 */
+	if (unlikely(!d_list_empty(&cont->vc_dtx_reindex_list))) {
+		dae = d_list_entry(cont->vc_dtx_reindex_list.next,
+				   struct vos_dtx_act_ent, dae_order_link);
+		if (epoch > DAE_EPOCH(dae) - cont->vc_dtx_reindex_eph_diff)
+			epoch = DAE_EPOCH(dae) - cont->vc_dtx_reindex_eph_diff;
+	}
+
+	/*
+	 * vc_mod_epoch_bound guarantees that no modification with older epoch after last
+	 * reporting local stable epoch can be accepted. So if the new calculated result
+	 * is older, then reuse the former one.
+	 */
+	if (unlikely(epoch < cont->vc_local_stable_epoch))
+		epoch = cont->vc_local_stable_epoch;
+	else
+		cont->vc_local_stable_epoch = epoch;
+
+	/*
+	 * Update vc_mod_epoch_bound to guarantee that no update with older epoch can be
+	 * accepted after reporting the new local stable epoch. The semantics may be so
+	 * strict as to cause a lot of DTX restarts.
+	 */
+	if (cont->vc_mod_epoch_bound < epoch) {
+		D_DEBUG(DB_TRACE, "Increase acceptable modification boundary from "
+			DF_X64 " to " DF_X64 " for container " DF_UUID "\n",
+			cont->vc_mod_epoch_bound, epoch, DP_UUID(cont->vc_id));
+		cont->vc_mod_epoch_bound = epoch;
+	}
+
+	return epoch;
+}
+
+/*
+ * The global stable epoch can be used for incremental reintegration: all the modifications
+ * involved in current target (container shard) under the global stable epoch have already
+ * been persistently stored globally, only need to care about the modification with newer
+ * epoch when reintegrate into the system.
+ */
+daos_epoch_t
+vos_cont_get_global_stable_epoch(daos_handle_t coh)
+{
+	struct vos_container	*cont;
+	struct vos_cont_ext_df	*cont_ext;
+	daos_epoch_t		 epoch = 0;
+
+	cont = vos_hdl2cont(coh);
+	D_ASSERT(cont != NULL);
+
+	cont_ext = umem_off2ptr(vos_cont2umm(cont), cont->vc_cont_df->cd_ext);
+	if (cont_ext != NULL)
+		epoch = cont_ext->ced_global_stable_epoch;
+
+	return epoch;
+}
+
+int
+vos_cont_set_global_stable_epoch(daos_handle_t coh, daos_epoch_t epoch)
+{
+	struct umem_instance	*umm;
+	struct vos_container	*cont;
+	struct vos_cont_ext_df	*cont_ext;
+	daos_epoch_t		 old = 0;
+	int			 rc = 0;
+
+	cont = vos_hdl2cont(coh);
+	D_ASSERT(cont != NULL);
+
+	umm = vos_cont2umm(cont);
+	cont_ext = umem_off2ptr(umm, cont->vc_cont_df->cd_ext);
+
+	/* Do not allow to set global stable epoch against old container without extension. */
+	if (cont_ext == NULL)
+		D_GOTO(out, rc = -DER_NOTSUPPORTED);
+
+	/*
+	 * Either the leader gives wrong global stable epoch or current target does not participate
+	 * in calculating the new global stable epoch. Then do not allow to set global stable epoch.
+	 */
+	if (unlikely(cont->vc_local_stable_epoch < epoch)) {
+		D_WARN("Invalid global stable epoch: " DF_X64" vs " DF_X64 " for container "
+		       DF_UUID "\n", cont->vc_local_stable_epoch, epoch, DP_UUID(cont->vc_id));
+		D_GOTO(out, rc = -DER_NO_PERM);
+	}
+
+	if (unlikely(cont_ext->ced_global_stable_epoch > epoch)) {
+		D_WARN("Do not allow to rollback global stable epoch from "
+		       DF_X64" to " DF_X64 " for container " DF_UUID "\n",
+		       cont_ext->ced_global_stable_epoch, epoch, DP_UUID(cont->vc_id));
+		D_GOTO(out, rc = -DER_NO_PERM);
+	}
+
+	if (cont_ext->ced_global_stable_epoch == epoch)
+		D_GOTO(out, rc = 0);
+
+	old = cont_ext->ced_global_stable_epoch;
+	rc = umem_tx_begin(umm, NULL);
+	if (rc == 0) {
+		rc = umem_tx_add_ptr(umm, &cont_ext->ced_global_stable_epoch,
+				     sizeof(cont_ext->ced_global_stable_epoch));
+		if (rc == 0) {
+			cont_ext->ced_global_stable_epoch = epoch;
+			rc = umem_tx_commit(vos_cont2umm(cont));
+		} else {
+			rc = umem_tx_abort(umm, rc);
+		}
+	}
+
+	DL_CDEBUG(rc != 0, DLOG_ERR, DB_MGMT, rc,
+		  "Set global stable epoch from "DF_X64" to " DF_X64 " for container " DF_UUID,
+		  old , epoch, DP_UUID(cont->vc_id));
+
+out:
+	return rc;
+}
+
+int
+vos_cont_set_mod_bound(daos_handle_t coh, uint64_t epoch)
+{
+	struct vos_container	*cont;
+
+	cont = vos_hdl2cont(coh);
+	D_ASSERT(cont != NULL);
+
+	if (cont->vc_mod_epoch_bound < epoch) {
+		D_DEBUG(DB_TRACE, "Increase acceptable modification boundary from "
+			DF_X64 " to " DF_X64 " for container " DF_UUID "\n",
+			cont->vc_mod_epoch_bound, epoch, DP_UUID(cont->vc_id));
+		cont->vc_mod_epoch_bound = epoch;
+	}
+
+	return 0;
+}
diff --git a/src/vos/vos_dtx.c b/src/vos/vos_dtx.c
index bf08a34ba1e..5ed24e20909 100644
--- a/src/vos/vos_dtx.c
+++ b/src/vos/vos_dtx.c
@@ -264,8 +264,10 @@ dtx_act_ent_free(struct btr_instance *tins, struct btr_record *rec,
 	dae = umem_off2ptr(&tins->ti_umm, rec->rec_off);
 	rec->rec_off = UMOFF_NULL;
 
-	if (dae != NULL)
+	if (dae != NULL) {
+		d_list_del_init(&dae->dae_order_link);
 		d_list_del_init(&dae->dae_link);
+	}
 
 	if (args != NULL) {
 		/* Return the record addreass (offset in DRAM).
@@ -1019,11 +1021,76 @@ vos_dtx_alloc(struct umem_instance *umm, struct dtx_handle *dth)
 	uint32_t			 idx;
 	d_iov_t				 kiov;
 	d_iov_t				 riov;
+	uint64_t			 now;
 	int				 rc = 0;
 
 	cont = vos_hdl2cont(dth->dth_coh);
 	D_ASSERT(cont != NULL);
 
+	/* Do not allow the modification with too old epoch. */
+	if (dth->dth_epoch <= cont->vc_mod_epoch_bound) {
+		now = daos_gettime_coarse();
+		if (now - cont->vc_dtx_reject_ts > 10) {
+			D_WARN("Reject DTX (1) " DF_DTI " with epoch " DF_X64
+			       " vs bound " DF_X64 "\n", DP_DTI(&dth->dth_xid),
+			       dth->dth_epoch, cont->vc_mod_epoch_bound);
+			cont->vc_dtx_reject_ts = now;
+		}
+		return -DER_TX_RESTART;
+	}
+
+	/*
+	 * NOTE: For the purpose of efficient calculating container based local stable epoch,
+	 *	 we will maintain some kind of sorted list for active DTX entries with epoch
+	 *	 order. But considering related overhead, it is not easy to maintain a strictly
+	 *	 sorted list for all active DTX entries. For the DTX which leader resides on
+	 *	 current target, its epoch is already sorted when generate on current engine.
+	 *	 So the main difficulty is for those DTX entries which leaders are on remote
+	 *	 targets.
+	 *
+	 *	 On the other hand, the local stable epoch is mainly used to generate global
+	 *	 stable epoch that is for incremental reintegration. In fact, we do not need
+	 *	 a very accurate global stable epoch for incremental reintegration. It means
+	 *	 that it does not matter (non-fatal) if the calculated stable epoch is a bit
+	 *	 smaller than the real case. For example, seconds error for the stable epoch
+	 *	 almost can be ignored if we compare such overhead with rebuilding the whole
+	 *	 target from scratch. So for the DTX entry which leader is on remote target,
+	 *	 we will maintain it in the list with relative incremental trend based on the
+	 *	 epoch instead of strict sorting the epoch. We introduce an O(1) algorithm to
+	 *	 handle such unsorted DTX entries list.
+	 *
+	 *	 For distributed transaction, its epoch may be generated on non-leader.
+	 */
+
+	if (!dth->dth_epoch_owner && !d_list_empty(&cont->vc_dtx_unsorted_list)) {
+		dae = d_list_entry(cont->vc_dtx_unsorted_list.prev, struct vos_dtx_act_ent,
+				   dae_order_link);
+		if (dth->dth_epoch < DAE_EPOCH(dae) &&
+		    cont->vc_mod_epoch_bound < DAE_EPOCH(dae) - d_sec2hlc(vos_agg_gap)) {
+			/*
+			 * It guarantees that even if there was some older DTX to be added,
+			 * the epoch difference between it and all former added ones cannot
+			 * exceed vos_agg_gap. So we can easily calculate the local stable
+			 * epoch. Please reference vos_cont_get_local_stable_epoch().
+			 */
+			D_DEBUG(DB_TRACE, "Increase acceptable modification boundary from "
+				DF_X64 " to " DF_X64 " for container " DF_UUID "\n",
+				cont->vc_mod_epoch_bound,
+				DAE_EPOCH(dae) - d_sec2hlc(vos_agg_gap), DP_UUID(cont->vc_id));
+			cont->vc_mod_epoch_bound = DAE_EPOCH(dae) - d_sec2hlc(vos_agg_gap);
+			if (dth->dth_epoch <= cont->vc_mod_epoch_bound) {
+				now = daos_gettime_coarse();
+				if (now - cont->vc_dtx_reject_ts > 10) {
+					D_WARN("Reject DTX (2) " DF_DTI " with epoch " DF_X64
+					       " vs bound " DF_X64 "\n", DP_DTI(&dth->dth_xid),
+					       dth->dth_epoch, cont->vc_mod_epoch_bound);
+					cont->vc_dtx_reject_ts = now;
+				}
+				return -DER_TX_RESTART;
+			}
+		}
+	}
+
 	rc = lrua_allocx(cont->vc_dtx_array, &idx, dth->dth_epoch, &dae, &dth->dth_local_stub);
 	if (rc != 0) {
 		/* The array is full, need to commit some transactions first */
@@ -1036,6 +1103,7 @@ vos_dtx_alloc(struct umem_instance *umm, struct dtx_handle *dth)
 	}
 
 	D_INIT_LIST_HEAD(&dae->dae_link);
+	D_INIT_LIST_HEAD(&dae->dae_order_link);
 	DAE_LID(dae) = idx + DTX_LID_RESERVED;
 	if (dth->dth_solo)
 		DAE_LID(dae) |= DTX_LID_SOLO_FLAG;
@@ -1044,6 +1112,8 @@ vos_dtx_alloc(struct umem_instance *umm, struct dtx_handle *dth)
 	DAE_DKEY_HASH(dae) = dth->dth_dkey_hash;
 	DAE_EPOCH(dae) = dth->dth_epoch;
 	DAE_FLAGS(dae) = dth->dth_flags;
+	if (dth->dth_epoch_owner)
+		DAE_FLAGS(dae) |= DTE_EPOCH_SORTED;
 	DAE_VER(dae) = dth->dth_ver;
 
 	if (dth->dth_mbs != NULL) {
@@ -1072,6 +1142,15 @@ vos_dtx_alloc(struct umem_instance *umm, struct dtx_handle *dth)
 	if (rc == 0) {
 		dae->dae_start_time = daos_gettime_coarse();
 		d_list_add_tail(&dae->dae_link, &cont->vc_dtx_act_list);
+		if (dth->dth_epoch_owner)
+			d_list_add_tail(&dae->dae_order_link, &cont->vc_dtx_sorted_list);
+		else
+			/*
+			 * Add all the others, including non-leader(s), into unsorted list.
+			 * Then even though the leader was evicted for some reason, related
+			 * DTX still can be considered via the new leader on another target.
+			 */
+			d_list_add_tail(&dae->dae_order_link, &cont->vc_dtx_unsorted_list);
 		dth->dth_ent = dae;
 	} else {
 		dtx_evict_lid(cont, dae);
@@ -2979,6 +3058,13 @@ vos_dtx_act_reindex(struct vos_container *cont)
 	umem_off_t			 dbd_off = cont_df->cd_dtx_active_head;
 	d_iov_t				 kiov;
 	d_iov_t				 riov;
+	struct vos_dtx_act_ent		*prev = NULL;
+	/* The max epoch for all unsorted DTX entries to be re-indexed. */
+	uint64_t			 max_eph = 0;
+	/* The min epoch which DTX entry is after the max_eph DTX. */
+	uint64_t			 min_eph = 0;
+	/* The largest diff for above pairs 'max_eph - min_eph'. */
+	uint64_t			 diff = 0;
 	uint64_t			 start_time = daos_gettime_coarse();
 	int				 rc = 0;
 	int				 i;
@@ -3068,6 +3154,43 @@ vos_dtx_act_reindex(struct vos_container *cont)
 
 			dae->dae_start_time = start_time;
 			d_list_add_tail(&dae->dae_link, &cont->vc_dtx_act_list);
+			if (DAE_FLAGS(dae) & DTE_EPOCH_SORTED) {
+				d_list_add_tail(&dae->dae_order_link, &cont->vc_dtx_sorted_list);
+			} else {
+				/*
+				 * The DTX entries in the active blob may be generated against
+				 * different VOS AGG GAP configurations, or even upgraded from
+				 * old system that did not support VOS AGG GAP logic yet. Link
+				 * them into a reindex list. During the reindex scanning, we
+				 * will find out the pairs with the largest epoch difference.
+				 * Using such difference to estimate the local stable epoch.
+				 *
+				 * NOTE: The min_eph may be not the smallest one in all the DTX
+				 *	 entries to be re-indexed, instead, it is after current
+				 *	 known max_eph, and if max_eph is changed, min_eph will
+				 *	 be reset. So there may be multiple max/min pairs. Each
+				 *	 pair has its own epoch difference (max_eph - min_eph). We
+				 *	 use the largest diff.
+				 *
+				 * This is an O(N) algorithm. N is the count of DTX entries to be
+				 * re-indexed. Please reference vos_cont_get_local_stable_epoch().
+				 */
+				if (prev == NULL || DAE_EPOCH(dae) > DAE_EPOCH(prev)) {
+					if (max_eph < DAE_EPOCH(dae)) {
+						max_eph = DAE_EPOCH(dae);
+						min_eph = 0;
+					}
+				} else {
+					if (min_eph == 0 || min_eph > DAE_EPOCH(dae)) {
+						min_eph = DAE_EPOCH(dae);
+						if (diff < max_eph - min_eph)
+							diff = max_eph - min_eph;
+					}
+				}
+
+				d_list_add_tail(&dae->dae_order_link, &cont->vc_dtx_reindex_list);
+			}
+			prev = dae;
 			dbd_count++;
 		}
 
@@ -3085,6 +3208,8 @@ vos_dtx_act_reindex(struct vos_container *cont)
 		dbd_off = dbd->dbd_next;
 	}
 
+	cont->vc_dtx_reindex_eph_diff = diff;
+
 out:
 	return rc > 0 ? 0 : rc;
 }
@@ -3361,8 +3486,10 @@ vos_dtx_attach(struct dtx_handle *dth, bool persistent, bool exist)
 			vos_dtx_cleanup_internal(dth);
 		}
 
-		D_ERROR("Failed to pin DTX entry for "DF_DTI": "DF_RC"\n",
-			DP_DTI(&dth->dth_xid), DP_RC(rc));
+		if (rc != 0)
+			DL_CDEBUG(rc != -DER_TX_RESTART, DLOG_ERR, DB_TRACE, rc,
+				  "Failed to pin DTX entry for " DF_DTI,
+				  DP_DTI(&dth->dth_xid));
 	}
 
 	return rc;
diff --git a/src/vos/vos_internal.h b/src/vos/vos_internal.h
index df2e5e44a0c..acd9e685dd7 100644
--- a/src/vos/vos_internal.h
+++ b/src/vos/vos_internal.h
@@ -141,6 +141,12 @@ enum {
 /* Throttle ENOSPACE error message */
 #define VOS_NOSPC_ERROR_INTVL	60	/* seconds */
 
+extern uint32_t vos_agg_gap;
+
+#define VOS_AGG_GAP_MIN		20 /* seconds */
+#define VOS_AGG_GAP_DEF		60
+#define VOS_AGG_GAP_MAX		180
+
 extern unsigned int vos_agg_nvme_thresh;
 extern bool vos_dkey_punch_propagate;
 
@@ -363,6 +369,31 @@ struct vos_container {
 	struct btr_root		vc_dtx_committed_btr;
 	/* The list for active DTXs, roughly ordered in time. */
 	d_list_t		vc_dtx_act_list;
+	/* The list for the active DTX entries with epoch sorted. */
+	d_list_t		vc_dtx_sorted_list;
+	/* The list for the active DTX entries (but not re-indexed) with epoch unsorted. */
+	d_list_t		vc_dtx_unsorted_list;
+	/* The list for the active DTX entries that are re-indexed when open the container. */
+	d_list_t		vc_dtx_reindex_list;
+	/* The largest epoch difference for re-indexed DTX entries max/min pairs. */
+	uint64_t		vc_dtx_reindex_eph_diff;
+	/* The latest calculated local stable epoch. */
+	daos_epoch_t		vc_local_stable_epoch;
+	/*
+	 * The lowest epoch boundary for current acceptable modification. It cannot be lower than
+	 * vc_local_stable_epoch, otherwise, it may break stable epoch semantics. Because current
+	 * target reported local stable epoch may be used as global stable epoch. There is window
+	 * between current target reporting the local stable epoch and related leader setting the
+	 * global stable epoch. If the modification with older epoch arrives during such interval,
+	 * we have to reject it to avoid potential conflict.
+	 *
+	 * On the other hand, it must be higher than EC/VOS aggregation up boundary. Under space
+	 * pressure, the EC/VOS aggregation up boundary may be higher than vc_local_stable_epoch,
+	 * then it will cause vc_mod_epoch_bound > vc_local_stable_epoch.
+	 */
+	daos_epoch_t		vc_mod_epoch_bound;
+	/* Last timestamp when VOS rejects DTX because of stale epoch. */
+	uint64_t		vc_dtx_reject_ts;
 	/* The count of committed DTXs. */
 	uint32_t		vc_dtx_committed_count;
 	/** Index for timestamp lookup */
@@ -432,6 +463,8 @@ struct vos_dtx_act_ent {
 	daos_unit_oid_t			*dae_oids;
 	/* The time (hlc) when the DTX entry is created. */
 	uint64_t			 dae_start_time;
+	/* Link into container::vc_dtx_{sorted,unsorted,reindex}_list. */
+	d_list_t			 dae_order_link;
 	/* Link into container::vc_dtx_act_list. */
 	d_list_t			 dae_link;
 	/* Back pointer to the DTX handle. */
diff --git a/src/vos/vos_layout.h b/src/vos/vos_layout.h
index 79fc7cbc7ae..b13790fe74f 100644
--- a/src/vos/vos_layout.h
+++ b/src/vos/vos_layout.h
@@ -1,5 +1,6 @@
 /**
  * (C) Copyright 2016-2024 Intel Corporation.
+ * (C) Copyright 2025 Hewlett Packard Enterprise Development LP
  *
  * SPDX-License-Identifier: BSD-2-Clause-Patent
  */
@@ -273,8 +274,13 @@ enum vos_io_stream {
 struct vos_cont_ext_df {
 	/* GC bucket extension */
 	struct vos_gc_bkt_df		ced_gc_bkt;
+	/*
+	 * Any modification involved in current target (container shard) under the global
+	 * stable epoch has already been persistently stored globally.
+	 */
+	uint64_t			ced_global_stable_epoch;
 	/* Reserved for potential new features */
-	uint64_t			ced_paddings[38];
+	uint64_t			ced_paddings[37];
 	/* Reserved for future extension */
 	uint64_t			ced_reserve;
 };

From c986d348509d46fbeafaf2ea46cf408f08f3da8a Mon Sep 17 00:00:00 2001
From: Niu Yawei <yawei.niu@intel.com>
Date: Sun, 19 Jan 2025 02:04:34 +0800
Subject: [PATCH 5/7] DAOS-16914 vos: don't print error message on ITER_EXIT
 (#15675)

Don't print error message on ITER_EXIT in vos_obj_iterate().

Signed-off-by: Niu Yawei <yawei.niu@hpe.com>
---
 src/vos/vos_iterator.c | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/src/vos/vos_iterator.c b/src/vos/vos_iterator.c
index 30baae492e5..d038f3297b6 100644
--- a/src/vos/vos_iterator.c
+++ b/src/vos/vos_iterator.c
@@ -1,5 +1,6 @@
 /**
  * (C) Copyright 2016-2024 Intel Corporation.
+ * (C) Copyright 2025 Hewlett Packard Enterprise Development LP
  *
  * SPDX-License-Identifier: BSD-2-Clause-Patent
  */
@@ -785,6 +786,10 @@ vos_iter_cb(vos_iter_cb_t iter_cb, daos_handle_t ih, vos_iter_entry_t *iter_ent,
 	return rc;
 }
 
+/*
+ * ITER_EXIT indicates that the iteration is interrupted (for instance, the iterating ULT is
+ * terminated by sched_req_wait()), we'd return non-zero value to inform caller in such case.
+ */
 #define JUMP_TO_STAGE(rc, next_label, probe_label, abort_label)				\
 	do {										\
 		switch (rc) {								\
@@ -1089,8 +1094,8 @@ vos_iterate_obj(vos_iter_param_t *param, bool recursive, struct vos_iter_anchors
 
 	cont = vos_hdl2cont(param->ip_hdl);
 	if (!vos_pool_is_evictable(cont->vc_pool))
-		return vos_iterate_internal(param, VOS_ITER_OBJ, recursive, false, anchors,
-					    pre_cb, post_cb, arg, dth);
+		return vos_iterate_internal(param, VOS_ITER_OBJ, recursive, false, anchors, pre_cb,
+					    post_cb, arg, dth);
 
 	/* The caller must provide a filter callback and call the oi_bkt_iter_skip() properly */
 	D_ASSERT(param->ip_filter_cb != NULL && param->ip_bkt_iter == NULL);
@@ -1112,7 +1117,8 @@ vos_iterate_obj(vos_iter_param_t *param, bool recursive, struct vos_iter_anchors
 		rc = vos_iterate_internal(param, VOS_ITER_OBJ, recursive, false, anchors,
 					  pre_cb, post_cb, arg, dth);
 		if (rc) {
-			DL_ERROR(rc, "Iterate bucket:%u failed.", i);
+			DL_CDEBUG(rc == ITER_EXIT, DB_TRACE, DLOG_ERR, rc,
+				  "Iterate bucket:%u failed.", i);
 			break;
 		}
 		reset_anchors(VOS_ITER_OBJ, anchors);
@@ -1136,6 +1142,6 @@ vos_iterate(vos_iter_param_t *param, vos_iter_type_t type, bool recursive,
 {
 	D_ASSERT((param->ip_flags & VOS_IT_KEY_TREE) == 0);
 
-	return vos_iterate_internal(param, type, recursive, false, anchors,
-				    pre_cb, post_cb, arg, dth);
+	return vos_iterate_internal(param, type, recursive, false, anchors, pre_cb, post_cb, arg,
+				    dth);
 }

From 4a95784c1011fffb8dc67cff380a519ab28791c4 Mon Sep 17 00:00:00 2001
From: mlawsonca <mlawsonca@users.noreply.github.com>
Date: Sat, 18 Jan 2025 11:01:52 -0800
Subject: [PATCH 6/7] DAOS-16959 ci: Fix post-provisioning bug (#15740)

Fixes a bug that sets the number of result files to 1 if none exist

Signed-off-by: Margaret Lawson <mlawsonca@google.com>
---
 ci/junit.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/junit.sh b/ci/junit.sh
index 549922499dd..e0051b819cd 100644
--- a/ci/junit.sh
+++ b/ci/junit.sh
@@ -24,7 +24,7 @@ report_junit() {
     clush -o '-i ci_key' -l root -w "$nodes" --rcopy "$results"
 
     local results_files
-    results_files=("$results".*)
+    mapfile -t results_files < <(find . -maxdepth 1 -name "$results.*") # array: empty if no match
 
     if [ ${#results_files[@]} -eq 0 ]; then
         echo "No results found to report as JUnit results"

From 0be2d10c2917257b3d1ca6d8af9fdfc0891d9aa2 Mon Sep 17 00:00:00 2001
From: Niu Yawei <yawei.niu@intel.com>
Date: Mon, 20 Jan 2025 10:30:51 +0800
Subject: [PATCH 7/7] DAOS-16640 vos: tune aggregation credits for phase2 pool
 (#15735)

When md-on-ssd phase2 pool runs into space pressure, larger SCAN credits
will be used to reduce yield & reprobe on iterating, larger DEL credits
will be used to drop more punched objects to GC in one batch, so that GC
will likely reclaim more objects when reclaiming a bucket.

Though larger aggregation credits will lower front end I/O performance,
it can greatly reduce page misses for GC when free space/page is tight.

This PR also fixed the NE space pressure check, where the sys reserved
space should not be taken into account.

Signed-off-by: Niu Yawei <yawei.niu@hpe.com>
---
 src/engine/sched.c      | 10 +++-------
 src/vos/vos_aggregate.c | 29 ++++++++++++++++++++++-------
 2 files changed, 25 insertions(+), 14 deletions(-)

diff --git a/src/engine/sched.c b/src/engine/sched.c
index c9c34e09150..3558902e101 100644
--- a/src/engine/sched.c
+++ b/src/engine/sched.c
@@ -771,7 +771,7 @@ check_space_pressure(struct dss_xstream *dx, struct sched_pool_info *spi)
 {
 	struct sched_info	*info = &dx->dx_sched_info;
 	struct vos_pool_space	 vps = { 0 };
-	uint64_t		 scm_left, nvme_left, ne_left, ne_sys;
+	uint64_t                 scm_left, nvme_left, ne_left;
 	struct pressure_ratio	*pr;
 	int			 orig_pressure, rc;
 
@@ -807,12 +807,8 @@ check_space_pressure(struct dss_xstream *dx, struct sched_pool_info *spi)
 	if (vps.vps_ne_total == 0) {
 		ne_left = UINT64_MAX;
 	} else {
-		D_ASSERT(vps.vps_ne_total < SCM_TOTAL(&vps));
-		ne_sys = SCM_SYS(&vps) * vps.vps_ne_total / SCM_TOTAL(&vps);
-		if (vps.vps_ne_free > ne_sys)
-			ne_left = vps.vps_ne_free - ne_sys;
-		else
-			ne_left = 0;
+		ne_left = vps.vps_ne_free;
+		D_ASSERT(ne_left <= vps.vps_ne_total);
 	}
 
 	if (NVME_TOTAL(&vps) == 0)      /* NVMe not enabled */
diff --git a/src/vos/vos_aggregate.c b/src/vos/vos_aggregate.c
index 63f5603b0e5..af597ffb238 100644
--- a/src/vos/vos_aggregate.c
+++ b/src/vos/vos_aggregate.c
@@ -1,5 +1,6 @@
 /**
  * (C) Copyright 2019-2024 Intel Corporation.
+ * (C) Copyright 2025 Hewlett Packard Enterprise Development LP
  *
  * SPDX-License-Identifier: BSD-2-Clause-Patent
  */
@@ -177,10 +178,24 @@ struct vos_agg_param {
 };
 
 static inline void
-credits_set(struct vos_agg_credits *vac, bool tight)
+credits_set(struct vos_pool *pool, struct vos_agg_credits *vac, bool tight)
 {
-	vac->vac_creds_scan = tight ? AGG_CREDS_SCAN_TIGHT : AGG_CREDS_SCAN_SLACK;
-	vac->vac_creds_del = tight ? AGG_CREDS_DEL_TIGHT : AGG_CREDS_DEL_SLACK;
+	unsigned int multiplier = 1;
+
+	/*
+	 * When md-on-ssd phase2 pool runs into space pressure, larger SCAN credits will
+	 * be used to reduce yield & reprobe on iterating, larger DEL credits will be used
+	 * to drop more punched objects to GC in one batch, so that GC will likely reclaim
+	 * more objects when reclaiming a bucket.
+	 *
+	 * Though larger aggregation credits will lower front end I/O performance, it can
+	 * greatly reduce page misses for GC when free space/page is tight.
+	 */
+	if (tight && vos_pool_is_evictable(pool))
+		multiplier = 100;
+
+	vac->vac_creds_scan  = (tight ? AGG_CREDS_SCAN_TIGHT : AGG_CREDS_SCAN_SLACK) * multiplier;
+	vac->vac_creds_del   = (tight ? AGG_CREDS_DEL_TIGHT : AGG_CREDS_DEL_SLACK) * multiplier;
 	vac->vac_creds_merge = tight ? AGG_CREDS_MERGE_TIGHT : AGG_CREDS_MERGE_SLACK;
 }
 
@@ -323,7 +338,7 @@ vos_aggregate_yield(struct vos_agg_param *agg_param)
 
 	if (agg_param->ap_yield_func == NULL) {
 		bio_yield(agg_param->ap_umm);
-		credits_set(&agg_param->ap_credits, true);
+		credits_set(cont->vc_pool, &agg_param->ap_credits, true);
 		return false;
 	}
 
@@ -333,7 +348,7 @@ vos_aggregate_yield(struct vos_agg_param *agg_param)
 		return true;
 
 	/* rc == 0: tight mode; rc == 1: slack mode */
-	credits_set(&agg_param->ap_credits, rc == 0);
+	credits_set(cont->vc_pool, &agg_param->ap_credits, rc == 0);
 
 	return false;
 }
@@ -2702,7 +2717,7 @@ vos_aggregate(daos_handle_t coh, daos_epoch_range_t *epr,
 	/* Set aggregation parameters */
 	ad->ad_agg_param.ap_umm = &cont->vc_pool->vp_umm;
 	ad->ad_agg_param.ap_coh = coh;
-	credits_set(&ad->ad_agg_param.ap_credits, true);
+	credits_set(cont->vc_pool, &ad->ad_agg_param.ap_credits, true);
 	ad->ad_agg_param.ap_discard = 0;
 	ad->ad_agg_param.ap_yield_func = yield_func;
 	ad->ad_agg_param.ap_yield_arg = yield_arg;
@@ -2822,7 +2837,7 @@ vos_discard(daos_handle_t coh, daos_unit_oid_t *oidp, daos_epoch_range_t *epr,
 	/* Set aggregation parameters */
 	ad->ad_agg_param.ap_umm = &cont->vc_pool->vp_umm;
 	ad->ad_agg_param.ap_coh = coh;
-	credits_set(&ad->ad_agg_param.ap_credits, true);
+	credits_set(cont->vc_pool, &ad->ad_agg_param.ap_credits, true);
 	ad->ad_agg_param.ap_discard = 1;
 	ad->ad_agg_param.ap_yield_func = yield_func;
 	ad->ad_agg_param.ap_yield_arg = yield_arg;