Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/master' into tanabarr/control-me…
Browse files Browse the repository at this point in the history
…mfilebytes-mode-mdonssd

Signed-off-by: Tom Nabarro <tom.nabarro@intel.com>
  • Loading branch information
tanabarr committed Jan 20, 2025
2 parents 7f616fa + 0be2d10 commit 0de7e10
Show file tree
Hide file tree
Showing 28 changed files with 660 additions and 136 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ossf-scorecard.yml
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,6 @@ jobs:
# Upload the results to GitHub's code scanning dashboard (optional).
# Commenting out will disable upload of results to your repo's Code Scanning dashboard
- name: "Upload to code-scanning"
uses: github/codeql-action/upload-sarif@48ab28a6f5dbc2a99bf1e0131198dd8f1df78169 # v3.28.0
uses: github/codeql-action/upload-sarif@b6a472f63d85b9c78a3ac5e89422239fc15e9b3c # v3.28.1
with:
sarif_file: results.sarif
2 changes: 1 addition & 1 deletion .github/workflows/trivy.yml
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ jobs:
trivy-config: 'utils/trivy/trivy.yaml'

- name: Upload Trivy scan results to GitHub Security tab
uses: github/codeql-action/upload-sarif@48ab28a6f5dbc2a99bf1e0131198dd8f1df78169 # v3.28.0
uses: github/codeql-action/upload-sarif@b6a472f63d85b9c78a3ac5e89422239fc15e9b3c # v3.28.1
with:
sarif_file: 'trivy-results.sarif'

Expand Down
2 changes: 1 addition & 1 deletion ci/junit.sh
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ report_junit() {
clush -o '-i ci_key' -l root -w "$nodes" --rcopy "$results"

local results_files
results_files=("$results".*)
results_files=$(find . -maxdepth 1 -name "$results.*")

if [ ${#results_files[@]} -eq 0 ]; then
echo "No results found to report as JUnit results"
Expand Down
38 changes: 34 additions & 4 deletions src/container/srv_target.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
/**
* (C) Copyright 2016-2024 Intel Corporation.
* (C) Copyright 2025 Google LLC
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -321,7 +323,7 @@ cont_child_aggregate(struct ds_cont_child *cont, cont_aggregate_cb_t agg_cb,
DAOS_FAIL_CHECK(DAOS_FORCE_EC_AGG_PEER_FAIL)))
interval = 0;
else
interval = d_sec2hlc(DAOS_AGG_THRESHOLD);
interval = d_sec2hlc(vos_get_agg_gap());

D_ASSERT(hlc > (interval * 2));
/*
Expand Down Expand Up @@ -409,6 +411,9 @@ cont_child_aggregate(struct ds_cont_child *cont, cont_aggregate_cb_t agg_cb,
DP_CONT(cont->sc_pool->spc_uuid, cont->sc_uuid),
tgt_id, epoch_range.epr_lo, epoch_range.epr_hi);

if (!param->ap_vos_agg)
vos_cont_set_mod_bound(cont->sc_hdl, epoch_range.epr_hi);

flags |= VOS_AGG_FL_FORCE_MERGE;
rc = agg_cb(cont, &epoch_range, flags, param);
if (rc)
Expand All @@ -425,6 +430,9 @@ cont_child_aggregate(struct ds_cont_child *cont, cont_aggregate_cb_t agg_cb,
DP_CONT(cont->sc_pool->spc_uuid, cont->sc_uuid),
tgt_id, epoch_range.epr_lo, epoch_range.epr_hi);

if (!param->ap_vos_agg)
vos_cont_set_mod_bound(cont->sc_hdl, epoch_range.epr_hi);

if (dss_xstream_is_busy())
flags &= ~VOS_AGG_FL_FORCE_MERGE;
rc = agg_cb(cont, &epoch_range, flags, param);
Expand Down Expand Up @@ -1607,11 +1615,23 @@ ds_cont_local_open(uuid_t pool_uuid, uuid_t cont_hdl_uuid, uuid_t cont_uuid,
*/
D_ASSERT(hdl->sch_cont != NULL);
D_ASSERT(hdl->sch_cont->sc_pool != NULL);

hdl->sch_cont->sc_open++;
if (hdl->sch_cont->sc_open > 1) {
/* If another open is still in flight (and possibly stuck),
* return -DER_AGAIN so the caller retries until it finishes.
*/
if (hdl->sch_cont->sc_open_initializing) {
hdl->sch_cont->sc_open--;
D_GOTO(err_cont, rc = -DER_AGAIN);
}

if (hdl->sch_cont->sc_open > 1)
goto opened;
/* Only go through if the 1st open succeeds */
if (hdl->sch_cont->sc_props_fetched)
goto opened;
}

hdl->sch_cont->sc_open_initializing = 1;
if (ds_pool_restricted(hdl->sch_cont->sc_pool->spc_pool, false))
goto csum_init;

Expand Down Expand Up @@ -1646,6 +1666,8 @@ ds_cont_local_open(uuid_t pool_uuid, uuid_t cont_hdl_uuid, uuid_t cont_uuid,
rc = ds_cont_csummer_init(hdl->sch_cont);
if (rc != 0)
D_GOTO(err_dtx, rc);

hdl->sch_cont->sc_open_initializing = 0;
}
opened:
if (cont_hdl != NULL) {
Expand All @@ -1663,6 +1685,7 @@ ds_cont_local_open(uuid_t pool_uuid, uuid_t cont_hdl_uuid, uuid_t cont_uuid,
dtx_cont_close(hdl->sch_cont, true);

err_cont:
hdl->sch_cont->sc_open_initializing = 0;
if (daos_handle_is_valid(poh)) {
int rc_tmp;

Expand Down Expand Up @@ -1750,9 +1773,15 @@ ds_cont_tgt_open(uuid_t pool_uuid, uuid_t cont_hdl_uuid,
D_DEBUG(DB_TRACE, "open pool/cont/hdl "DF_UUID"/"DF_UUID"/"DF_UUID"\n",
DP_UUID(pool_uuid), DP_UUID(cont_uuid), DP_UUID(cont_hdl_uuid));

retry:
rc = ds_pool_thread_collective(pool_uuid, PO_COMP_ST_NEW | PO_COMP_ST_DOWN |
PO_COMP_ST_DOWNOUT, cont_open_one, &arg, 0);
if (rc != 0)
if (rc != 0) {
if (rc == -DER_AGAIN) {
dss_sleep(50);
goto retry;
}

/* Once the target is excluded from the pool, it might still be
* in the cart group, so IV cont open might still come to this
* target, especially if cont open/close will be
Expand All @@ -1762,6 +1791,7 @@ ds_cont_tgt_open(uuid_t pool_uuid, uuid_t cont_hdl_uuid,
D_ERROR("open "DF_UUID"/"DF_UUID"/"DF_UUID":"DF_RC"\n",
DP_UUID(pool_uuid), DP_UUID(cont_uuid),
DP_UUID(cont_hdl_uuid), DP_RC(rc));
}

return rc;
}
Expand Down
2 changes: 2 additions & 0 deletions src/dtx/dtx_common.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/**
* (C) Copyright 2019-2024 Intel Corporation.
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -922,6 +923,7 @@ dtx_handle_init(struct dtx_id *dti, daos_handle_t xoh, struct dtx_epoch *epoch,
dth->dth_for_migration = (flags & DTX_FOR_MIGRATION) ? 1 : 0;
dth->dth_ignore_uncommitted = (flags & DTX_IGNORE_UNCOMMITTED) ? 1 : 0;
dth->dth_prepared = (flags & DTX_PREPARED) ? 1 : 0;
dth->dth_epoch_owner = (flags & DTX_EPOCH_OWNER) ? 1 : 0;
dth->dth_aborted = 0;
dth->dth_already = 0;
dth->dth_need_validation = 0;
Expand Down
4 changes: 3 additions & 1 deletion src/dtx/tests/dts_structs.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/**
* (C) Copyright 2024 Intel Corporation.
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -70,8 +71,9 @@ struct_dtx_handle(void **state)
SET_BITFIELD_1(dummy, dth_need_validation);
SET_BITFIELD_1(dummy, dth_ignore_uncommitted);
SET_BITFIELD_1(dummy, dth_local);
SET_BITFIELD_1(dummy, dth_epoch_owner);
SET_BITFIELD_1(dummy, dth_local_complete);
SET_BITFIELD(dummy, padding1, 13);
SET_BITFIELD(dummy, padding1, 12);

SET_FIELD(dummy, dth_dti_cos_count);
SET_FIELD(dummy, dth_dti_cos);
Expand Down
36 changes: 16 additions & 20 deletions src/engine/sched.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/**
* (C) Copyright 2016-2024 Intel Corporation.
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -197,17 +198,6 @@ enum {

static int sched_policy;

/*
* Time threshold for giving IO up throttling. If space pressure stays in the
* highest level for enough long time, we assume that no more space can be
* reclaimed and choose to give up IO throttling, so that ENOSPACE error could
* be returned to client earlier.
*
* To make time for aggregation reclaiming overwriteen space, this threshold
* should be longer than the DAOS_AGG_THRESHOLD.
*/
#define SCHED_DELAY_THRESH 40000 /* msecs */

struct pressure_ratio {
unsigned int pr_free; /* free space ratio */
unsigned int pr_gc_ratio; /* CPU percentage for GC & Aggregation */
Expand Down Expand Up @@ -781,7 +771,7 @@ check_space_pressure(struct dss_xstream *dx, struct sched_pool_info *spi)
{
struct sched_info *info = &dx->dx_sched_info;
struct vos_pool_space vps = { 0 };
uint64_t scm_left, nvme_left, ne_left, ne_sys;
uint64_t scm_left, nvme_left, ne_left;
struct pressure_ratio *pr;
int orig_pressure, rc;

Expand Down Expand Up @@ -817,12 +807,8 @@ check_space_pressure(struct dss_xstream *dx, struct sched_pool_info *spi)
if (vps.vps_ne_total == 0) {
ne_left = UINT64_MAX;
} else {
D_ASSERT(vps.vps_ne_total < SCM_TOTAL(&vps));
ne_sys = SCM_SYS(&vps) * vps.vps_ne_total / SCM_TOTAL(&vps);
if (vps.vps_ne_free > ne_sys)
ne_left = vps.vps_ne_free - ne_sys;
else
ne_left = 0;
ne_left = vps.vps_ne_free;
D_ASSERT(ne_left <= vps.vps_ne_total);
}

if (NVME_TOTAL(&vps) == 0) /* NVMe not enabled */
Expand Down Expand Up @@ -943,12 +929,22 @@ is_gc_pending(struct sched_pool_info *spi)
return spi->spi_gc_ults && (spi->spi_gc_ults > spi->spi_gc_sleeping);
}

/* Just run into this space pressure situation recently? */
/*
* Did we run into this space pressure situation only recently?
*
* If space pressure stays at the highest level for long enough, we assume
* that no more space can be reclaimed and choose to give up IO throttling, so
* that an ENOSPACE error can be returned to the client earlier.
*
* To give aggregation time to reclaim overwritten space, this threshold
* should be longer than the VOS aggregation epoch gap against the current HLC.
*/
static inline bool
is_pressure_recent(struct sched_info *info, struct sched_pool_info *spi)
{
D_ASSERT(info->si_cur_ts >= spi->spi_pressure_ts);
return (info->si_cur_ts - spi->spi_pressure_ts) < SCHED_DELAY_THRESH;
return (info->si_cur_ts - spi->spi_pressure_ts) <
(vos_get_agg_gap() + 10) * 1000; /* msecs */
}

static inline uint64_t
Expand Down
12 changes: 1 addition & 11 deletions src/include/daos/dtx.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/**
* (C) Copyright 2019-2023 Intel Corporation.
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -27,17 +28,6 @@
/* The time (in second) threshold for batched DTX commit. */
#define DTX_COMMIT_THRESHOLD_AGE 10

/*
* VOS aggregation should try to avoid aggregating in the epoch range where
* lots of data records are pending to commit, so the aggregation epoch upper
* bound is: current HLC - (DTX batched commit threshold + buffer period)
*
* To avoid conflicting of aggregation vs. transactions, any transactional
* update/fetch with epoch lower than the aggregation upper bound should be
* rejected and restarted.
*/
#define DAOS_AGG_THRESHOLD (DTX_COMMIT_THRESHOLD_AGE + 10) /* seconds */

enum dtx_target_flags {
/* The target only contains read-only operations for the DTX. */
DTF_RDONLY = (1 << 0),
Expand Down
20 changes: 6 additions & 14 deletions src/include/daos_srv/container.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/*
* (C) Copyright 2015-2024 Intel Corporation.
* (C) Copyright 2025 Google LLC
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -66,20 +67,11 @@ struct ds_cont_child {
ABT_cond sc_scrub_cond;
ABT_cond sc_rebuild_cond;
ABT_cond sc_fini_cond;
uint32_t sc_dtx_resyncing:1,
sc_dtx_reindex:1,
sc_dtx_reindex_abort:1,
sc_dtx_delay_reset:1,
sc_dtx_registered:1,
sc_props_fetched:1,
sc_stopping:1,
sc_destroying:1,
sc_vos_agg_active:1,
sc_ec_agg_active:1,
/* flag of CONT_CAPA_READ_DATA/_WRITE_DATA disabled */
sc_rw_disabled:1,
sc_scrubbing:1,
sc_rebuilding:1;
uint32_t sc_dtx_resyncing : 1, sc_dtx_reindex : 1, sc_dtx_reindex_abort : 1,
sc_dtx_delay_reset : 1, sc_dtx_registered : 1, sc_props_fetched : 1, sc_stopping : 1,
sc_destroying : 1, sc_vos_agg_active : 1, sc_ec_agg_active : 1,
/* flag of CONT_CAPA_READ_DATA/_WRITE_DATA disabled */
sc_rw_disabled : 1, sc_scrubbing : 1, sc_rebuilding : 1, sc_open_initializing : 1;
uint32_t sc_dtx_batched_gen;
/* Tracks the schedule request for aggregation ULT */
struct sched_request *sc_agg_req;
Expand Down
7 changes: 6 additions & 1 deletion src/include/daos_srv/dtx_srv.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/**
* (C) Copyright 2019-2024 Intel Corporation.
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -113,8 +114,10 @@ struct dtx_handle {
dth_ignore_uncommitted : 1,
/* Local transaction */
dth_local : 1,
/* Locally generate the epoch. */
dth_epoch_owner : 1,
/* Flag to commit the local transaction */
dth_local_complete : 1, padding1 : 13;
dth_local_complete : 1, padding1 : 12;

/* The count of the DTXs in the dth_dti_cos array. */
uint32_t dth_dti_cos_count;
Expand Down Expand Up @@ -287,6 +290,8 @@ enum dtx_flags {
DTX_RELAY = (1 << 10),
/** Local transaction */
DTX_LOCAL = (1 << 11),
/** Locally generate the epoch. */
DTX_EPOCH_OWNER = (1 << 12),
};

void
Expand Down
51 changes: 51 additions & 0 deletions src/include/daos_srv/vos.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/**
* (C) Copyright 2015-2024 Intel Corporation.
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -939,6 +940,56 @@ vos_update_renew_epoch(daos_handle_t ioh, struct dtx_handle *dth);
void
vos_dtx_renew_epoch(struct dtx_handle *dth);

/**
* Calculate current locally known stable epoch for the given container.
*
* \param coh [IN] Container open handle
*
* \return The epoch on success, negative value if error.
*/
daos_epoch_t
vos_cont_get_local_stable_epoch(daos_handle_t coh);

/**
* Get global stable epoch for the given container.
*
* \param coh [IN] Container open handle
*
* \return The epoch on success, negative value if error.
*/
daos_epoch_t
vos_cont_get_global_stable_epoch(daos_handle_t coh);

/**
* Set global stable epoch for the given container.
*
* \param coh [IN] Container open handle
* \param epoch [IN] The epoch to be used as the new global stable epoch.
*
* \return Zero on success, negative value if error.
*/
int
vos_cont_set_global_stable_epoch(daos_handle_t coh, daos_epoch_t epoch);

/**
* Set the lowest allowed modification epoch for the given container.
*
* \param coh [IN] Container open handle
* \param epoch [IN] The lowest allowed epoch for modification.
*
* \return Zero on success, negative value if error.
*/
int
vos_cont_set_mod_bound(daos_handle_t coh, uint64_t epoch);

/**
* Query the gap between the max allowed aggregation epoch and current HLC.
*
* \return The gap value in seconds.
*/
uint32_t
vos_get_agg_gap(void);

/**
* Get the recx/epoch list.
*
Expand Down
Loading

0 comments on commit 0de7e10

Please sign in to comment.