Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DAOS-16982 csum: recalculate checksum on retrying #15786

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 18 additions & 60 deletions src/client/dfuse/dfuse_core.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* (C) Copyright 2016-2024 Intel Corporation.

Check failure on line 2 in src/client/dfuse/dfuse_core.c

View workflow job for this annotation

GitHub Actions / Copyright check

Copyright out of date
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -78,11 +78,11 @@
static int
dfuse_parse_time(char *buff, size_t len, unsigned int *_out)
{
int matched;
unsigned int out = 0;
int count0 = 0;
int count1 = 0;
char c = '\0';
int matched;
unsigned int out = 0;
int count0 = 0;
int count1 = 0;
char c = '\0';

matched = sscanf(buff, "%u%n%c%n", &out, &count0, &c, &count1);

Expand Down Expand Up @@ -536,7 +536,7 @@
err_disconnect:
ret = daos_pool_disconnect(dfp->dfp_poh, NULL);
if (ret)
DFUSE_TRA_WARNING(dfp, "Failed to disconnect pool: "DF_RC, DP_RC(ret));
DFUSE_TRA_WARNING(dfp, "Failed to disconnect pool: " DF_RC, DP_RC(ret));
err_free:
D_FREE(dfp);
err:
Expand Down Expand Up @@ -1377,20 +1377,20 @@
int
dfuse_fs_start(struct dfuse_info *dfuse_info, struct dfuse_cont *dfs)
{
struct fuse_args args = {0};
struct fuse_args args = {0};
struct dfuse_inode_entry *ie;
struct d_slab_reg read_slab = {.sr_init = dfuse_event_init,
.sr_reset = dfuse_read_event_reset,
.sr_release = dfuse_event_release,
POOL_TYPE_INIT(dfuse_event, de_list)};
struct d_slab_reg read_slab = {.sr_init = dfuse_event_init,
.sr_reset = dfuse_read_event_reset,
.sr_release = dfuse_event_release,
POOL_TYPE_INIT(dfuse_event, de_list)};
struct d_slab_reg pre_read_slab = {.sr_init = dfuse_event_init,
.sr_reset = dfuse_pre_read_event_reset,
.sr_release = dfuse_event_release,
POOL_TYPE_INIT(dfuse_event, de_list)};
struct d_slab_reg write_slab = {.sr_init = dfuse_event_init,
.sr_reset = dfuse_write_event_reset,
.sr_release = dfuse_event_release,
POOL_TYPE_INIT(dfuse_event, de_list)};
struct d_slab_reg write_slab = {.sr_init = dfuse_event_init,
.sr_reset = dfuse_write_event_reset,
.sr_release = dfuse_event_release,
POOL_TYPE_INIT(dfuse_event, de_list)};
int rc;
int idx = 0;

Expand Down Expand Up @@ -1580,41 +1580,6 @@
return -DER_SUCCESS;
}

static int
ino_kernel_flush(d_list_t *rlink, void *arg)
{
struct dfuse_info *dfuse_info = arg;
struct dfuse_inode_entry *ie = container_of(rlink, struct dfuse_inode_entry, ie_htl);
int rc;

/* Only evict entries that are direct children of the root, the kernel
* will walk the tree for us
*/
if (ie->ie_parent != 1)
return 0;

/* Do not evict root itself */
if (ie->ie_stat.st_ino == 1)
return 0;

rc = fuse_lowlevel_notify_inval_entry(dfuse_info->di_session, ie->ie_parent, ie->ie_name,
strlen(ie->ie_name));
if (rc != 0 && rc != -EBADF)
DHS_WARN(ie, -rc, "%#lx %#lx " DF_DE, ie->ie_parent, ie->ie_stat.st_ino,
DP_DE(ie->ie_name));
else
DHS_INFO(ie, -rc, "%#lx %#lx " DF_DE, ie->ie_parent, ie->ie_stat.st_ino,
DP_DE(ie->ie_name));

/* If the FUSE connection is dead then do not traverse further, it
* doesn't matter what gets returned here, as long as it's negative
*/
if (rc == -EBADF)
return -DER_NO_HDL;

return -DER_SUCCESS;
}

static int
dfuse_cont_close_cb(d_list_t *rlink, void *handle)
{
Expand All @@ -1638,10 +1603,10 @@
static int
dfuse_pool_close_cb(d_list_t *rlink, void *handle)
{
struct dfuse_info *dfuse_info = handle;
struct dfuse_info *dfuse_info = handle;
struct dfuse_cont_core *dfcc, *dfccn;
struct dfuse_pool *dfp;
int rc;
struct dfuse_pool *dfp;
int rc;

dfp = container_of(rlink, struct dfuse_pool, dfp_entry);

Expand Down Expand Up @@ -1703,13 +1668,6 @@
sem_destroy(&eqt->de_sem);
}

/* First flush, instruct the kernel to forget items. This will run and work in ideal cases
* but often if the filesystem is unmounted it'll abort part-way through.
*/
rc = d_hash_table_traverse(&dfuse_info->dpi_iet, ino_kernel_flush, dfuse_info);

DHL_INFO(dfuse_info, rc, "Kernel flush complete");

/* At this point there's a number of inodes which are in memory, traverse these and free
* them, along with any resources.
* The reference count on inodes match kernel references but the fuse module is disconnected
Expand Down
3 changes: 2 additions & 1 deletion src/object/cli_csum.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/**
* (C) Copyright 2023 Intel Corporation.
* (C) Copyright 2025 Google LLC
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand All @@ -11,7 +12,7 @@
#include <daos/cont_props.h>
#include "obj_internal.h"

/** How many times to retry UPDATE RPCs on checksum error */
/** How many times to retry FETCH and UPDATE RPCs on checksum error */
#define MAX_CSUM_RETRY 10

int dc_obj_csum_update(struct daos_csummer *csummer, struct cont_props props, daos_obj_id_t param,
Expand Down
66 changes: 39 additions & 27 deletions src/object/cli_obj.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/**
* (C) Copyright 2016-2024 Intel Corporation.
* (C) Copyright 2025 Google LLC
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -4758,38 +4759,37 @@ obj_comp_cb(tse_task_t *task, void *data)

if (task->dt_result == -DER_CSUM || task->dt_result == -DER_TX_UNCERTAIN ||
task->dt_result == -DER_NVME_IO) {
if (!obj_auxi->spec_shard && !obj_auxi->spec_group &&
!obj_auxi->no_retry && !obj_auxi->ec_wait_recov) {
if (!obj_auxi->spec_shard && !obj_auxi->spec_group && !obj_auxi->no_retry &&
!obj_auxi->ec_wait_recov) {
/* Retry fetch on alternative shard */
if (obj_auxi->opc == DAOS_OBJ_RPC_FETCH) {
if (task->dt_result == -DER_CSUM)
if ((obj_auxi->opc == DAOS_OBJ_RPC_FETCH ||
obj_auxi->opc == DAOS_OBJ_RPC_UPDATE) &&
task->dt_result == -DER_CSUM) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I thought we only want to retry for csum error on wire? but not for NVME fetch CSUM error? No?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah that's correct, but it would lead to a much large change, and we also want to get a confirmation from the DAOS community to make sure it's fine to move that forward.

Therefore, our strategy here is to land a quick fix while waiting for it.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

oh, ok

struct shard_rw_args *rw_arg = &obj_auxi->rw_args;

/* Retry a few times on checksum error; something must
* go terribly wrong if checksum happened for 10 times in a
* row therefore we stop trying anyways even though there
* may exist more replicas. */
if (rw_arg->csum_retry_cnt < MAX_CSUM_RETRY) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

for fetch, it should be

if (rw_arg->csum_retry_cnt < max(MAX_CSUM_RETRY, obj_get_replicas(obj)) ?

Copy link
Contributor Author

@jxiong jxiong Jan 30, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah that's reasonable too. Actually I have realized this so I put a comment above. My argument is that if the checksum error has already happened 10 times probably we should just stop trying, because something must go terribly wrong.

The current logic would be persistent to future change too. If the error is due to network checksum error, we should stop trying anyways if it happened 10 times, no matter how many replicas it has.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ah, did not notice your comments.

obj_auxi->csum_retry = 1;
else if (task->dt_result == -DER_TX_UNCERTAIN)
rw_arg->csum_retry_cnt++;
D_DEBUG(DB_IO, DF_OID " checksum error, retrying\n",
DP_OID(obj->cob_md.omd_id));
} else {
D_ERROR(DF_OID
" too many retries on checksum error. "
"Failing I/O\n",
DP_OID(obj->cob_md.omd_id));
obj_auxi->io_retry = 0;
}
}

if (obj_auxi->opc == DAOS_OBJ_RPC_FETCH) {
if (task->dt_result == -DER_TX_UNCERTAIN)
obj_auxi->tx_uncertain = 1;
else
obj_auxi->nvme_io_err = 1;
} else {
if (obj_auxi->opc == DAOS_OBJ_RPC_UPDATE &&
task->dt_result == -DER_CSUM) {
struct shard_rw_args *rw_arg = &obj_auxi->rw_args;

/** Retry a few times on checksum error on update */
if (rw_arg->csum_retry_cnt < MAX_CSUM_RETRY) {
obj_auxi->csum_retry = 1;
rw_arg->csum_retry_cnt++;
D_DEBUG(DB_IO, DF_OID" checksum error on "
"update, retrying\n",
DP_OID(obj->cob_md.omd_id));
} else {
D_ERROR(DF_OID" checksum error on update, "
"too many retries. Failing I/O\n",
DP_OID(obj->cob_md.omd_id));
obj_auxi->io_retry = 0;
}
} else if (task->dt_result != -DER_NVME_IO) {
/* Don't retry update for UNCERTAIN errors */
obj_auxi->io_retry = 0;
}
}
} else {
obj_auxi->io_retry = 0;
Expand Down Expand Up @@ -5140,6 +5140,12 @@ obj_csum_update(struct dc_object *obj, daos_obj_update_t *args, struct obj_auxi_
if (!obj_csum_dedup_candidate(&obj->cob_co->dc_props, args->iods, args->nr))
return 0;

if (obj_auxi->csum_retry) {
/* Release old checksum result and prepare for new calculation */
daos_csummer_free_ci(obj->cob_co->dc_csummer, &obj_auxi->rw_args.dkey_csum);
daos_csummer_free_ic(obj->cob_co->dc_csummer, &obj_auxi->rw_args.iod_csums);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we probably want to do this after a couple of retries

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's really easy to add but I wonder if that is indeed necessary, because cksum error is a rare event by itself.

How about revising it to:

if (obj_auxi->csum_retry && obj_auxi->csum_retry_cnt > 2) { ... }

would that work for you?

}

return dc_obj_csum_update(obj->cob_co->dc_csummer, obj->cob_co->dc_props,
obj->cob_md.omd_id, args->dkey, args->iods, args->sgls, args->nr,
obj_auxi->reasb_req.orr_singv_los, &obj_auxi->rw_args.dkey_csum,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the case of the actual issue we saw, it was the dkey_csum that needs to be recalculated, is that happening here?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes if I read the code correctly because we release the previous calculation above.

Expand All @@ -5150,6 +5156,12 @@ static int
obj_csum_fetch(const struct dc_object *obj, daos_obj_fetch_t *args,
struct obj_auxi_args *obj_auxi)
{
if (obj_auxi->csum_retry) {
/* Release old checksum result and prepare for new calculation */
daos_csummer_free_ci(obj->cob_co->dc_csummer, &obj_auxi->rw_args.dkey_csum);
daos_csummer_free_ic(obj->cob_co->dc_csummer, &obj_auxi->rw_args.iod_csums);
}

return dc_obj_csum_fetch(obj->cob_co->dc_csummer, args->dkey, args->iods, args->sgls,
args->nr, obj_auxi->reasb_req.orr_singv_los,
&obj_auxi->rw_args.dkey_csum, &obj_auxi->rw_args.iod_csums);
Expand Down
14 changes: 4 additions & 10 deletions src/object/srv_obj.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
/**
* (C) Copyright 2016-2024 Intel Corporation.
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
* (C) Copyright 2025 Google LLC
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -1371,11 +1372,8 @@ obj_local_rw_internal(crt_rpc_t *rpc, struct obj_io_context *ioc, daos_iod_t *io
orw->orw_dkey_csum, &orw->orw_iod_array,
&orw->orw_oid);
if (rc != 0) {
D_ERROR(DF_C_UOID_DKEY"verify_keys error: "DF_RC"\n",
DP_C_UOID_DKEY(orw->orw_oid, &orw->orw_dkey),
DP_RC(rc));
if (rc == -DER_CSUM)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if (rc == -DER_CSUM)
this needs to be fixed as well?

obj_log_csum_err();
D_ERROR(DF_C_UOID_DKEY "verify_keys error: " DF_RC "\n",
DP_C_UOID_DKEY(orw->orw_oid, &orw->orw_dkey), DP_RC(rc));
return rc;
}

Expand Down Expand Up @@ -4598,12 +4596,8 @@ ds_cpd_handle_one(crt_rpc_t *rpc, struct daos_cpd_sub_head *dcsh, struct daos_cp
rc = csum_verify_keys(ioc->ioc_coc->sc_csummer,
&dcsr->dcsr_dkey, dcu->dcu_dkey_csum,
&dcu->dcu_iod_array, &dcsr->dcsr_oid);
if (rc != 0) {
if (rc == -DER_CSUM)
obj_log_csum_err();

if (rc != 0)
goto out;
}

if (iohs == NULL) {
D_ALLOC_ARRAY(iohs, dcde->dcde_write_cnt);
Expand Down
28 changes: 23 additions & 5 deletions src/tests/suite/daos_checksum.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/**
* (C) Copyright 2019-2023 Intel Corporation.
* (C) Copyright 2025 Google LLC
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand All @@ -17,6 +18,7 @@
#include <daos/object.h>

#define EC_CSUM_OC OC_EC_2P2G1
#define RP_CSUM_OC OC_RP_2G1

static void
iov_update_fill(d_iov_t *iov, char *data, uint64_t len_to_fill);
Expand Down Expand Up @@ -45,7 +47,14 @@ csum_ec_enable(void **state)
}

static inline int
csum_replia_enable(void **state)
csum_replica_enable(void **state)
{
dts_csum_oc = RP_CSUM_OC;
return 0;
}

static inline int
csum_shard_enable(void **state)
{
dts_csum_oc = OC_SX;
return 0;
Expand Down Expand Up @@ -2781,10 +2790,18 @@ setup(void **state)

#define STR_HELPER(x) #x
#define STR(x) STR_HELPER(x)
#define CSUM_TEST(dsc, test) { STR(__COUNTER__)". " dsc, test, \
csum_replia_enable, test_case_teardown }
#define EC_CSUM_TEST(dsc, test) { STR(__COUNTER__)". " dsc, test, \
csum_ec_enable, test_case_teardown }
#define CSUM_TEST(dsc, test) \
{ \
STR(__COUNTER__) ". " dsc, test, csum_shard_enable, test_case_teardown \
}
#define EC_CSUM_TEST(dsc, test) \
{ \
STR(__COUNTER__) ". " dsc, test, csum_ec_enable, test_case_teardown \
}
#define RP_CSUM_TEST(dsc, test) \
{ \
STR(__COUNTER__) ". " dsc, test, csum_replica_enable, test_case_teardown \
}

static const struct CMUnitTest csum_tests[] = {
CSUM_TEST("DAOS_CSUM00: csum disabled", checksum_disabled),
Expand Down Expand Up @@ -2815,6 +2832,7 @@ static const struct CMUnitTest csum_tests[] = {
CSUM_TEST("DAOS_CSUM07: Mix of Single Value and Array values iods", mix_test),
CSUM_TEST("DAOS_CSUM08: Update/Fetch A Key", test_update_fetch_a_key),
CSUM_TEST("DAOS_CSUM09: Update/Fetch D Key", test_update_fetch_d_key),
RP_CSUM_TEST("DAOS_CSUM09.2: Update/Fetch D Key", test_update_fetch_d_key),
CSUM_TEST("DAOS_CSUM10: Enumerate A Keys", test_enumerate_a_key),
CSUM_TEST("DAOS_CSUM11: Enumerate D Keys", test_enumerate_d_key),
CSUM_TEST("DAOS_CSUM12: Enumerate objects", test_enumerate_object),
Expand Down
Loading