Skip to content

Commit

Permalink
DAOS-14536 tests: more unit tests for replay and checkpoint (#13398)
Browse files Browse the repository at this point in the history
1. Add a basic test to verify that checkpoint works as expected.
2. Add tests to verify interrupted replay and checkpoint.
3. Fix a memory leak in checkpoint.
4. Fix dav_obj_open_internal() to return the correct error.
5. Bump DAOS_POOL_FAIL_MAP_REFRESH_SERIOUSLY to avoid a conflict with
DAOS_POOL_UPGRADE_CONT_ABORT.

Signed-off-by: Wang Shilong <[email protected]>
Signed-off-by: Sherin T George <[email protected]>
  • Loading branch information
Wang Shilong authored Dec 7, 2023
1 parent d3b7591 commit 51105db
Show file tree
Hide file tree
Showing 8 changed files with 225 additions and 16 deletions.
10 changes: 10 additions & 0 deletions src/bio/bio_wal.c
Original file line number Diff line number Diff line change
Expand Up @@ -1672,6 +1672,9 @@ bio_wal_replay(struct bio_meta_context *mc, struct bio_wal_rp_stats *wrs,
uint64_t total_bytes = 0, rpl_entries = 0, total_tx = 0;
uint64_t s_us = 0;

if (DAOS_FAIL_CHECK(DAOS_WAL_NO_REPLAY))
return 0;

D_ALLOC(buf, max_blks * blk_bytes);
if (buf == NULL)
return -DER_NOMEM;
Expand Down Expand Up @@ -1758,6 +1761,13 @@ bio_wal_replay(struct bio_meta_context *mc, struct bio_wal_rp_stats *wrs,
tight_loop = 0;
bio_yield(NULL);
}

/* Fault injection: the test must replay more transactions than the fail value */
if (DAOS_FAIL_CHECK(DAOS_WAL_FAIL_REPLAY) &&
nr_replayed > daos_fail_value_get()) {
rc = -DER_AGAIN;
break;
}
}
out:
if (rc >= 0) {
Expand Down
2 changes: 1 addition & 1 deletion src/common/dav/dav_iface.c
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ dav_obj_open_internal(int fd, int flags, size_t sz, const char *path, struct ume

rc = hdl->do_store->stor_ops->so_wal_replay(hdl->do_store, dav_wal_replay_cb, hdl);
if (rc) {
err = rc;
err = daos_der2errno(rc);
goto out2;
}

Expand Down
34 changes: 32 additions & 2 deletions src/common/mem.c
Original file line number Diff line number Diff line change
Expand Up @@ -2075,11 +2075,12 @@ umem_cache_checkpoint(struct umem_store *store, umem_cache_wait_cb_t wait_cb, vo
d_list_t free_list;
d_list_t waiting_list;
int i;
int rc;
int rc = 0;
int inflight = 0;
int pages_scanned = 0;
int dchunks_copied = 0;
int iovs_used = 0;
int nr_copying_pgs = 0;

if (cache == NULL)
return 0; /* TODO: When SMD is supported outside VOS, this will be an error */
Expand Down Expand Up @@ -2115,6 +2116,7 @@ umem_cache_checkpoint(struct umem_store *store, umem_cache_wait_cb_t wait_cb, vo
pinfo->pi_waiting = 1;
if (store->stor_ops->so_wal_id_cmp(store, pinfo->pi_last_inflight, chkpt_id) > 0)
chkpt_id = pinfo->pi_last_inflight;
nr_copying_pgs++;
}

do {
Expand Down Expand Up @@ -2157,11 +2159,32 @@ umem_cache_checkpoint(struct umem_store *store, umem_cache_wait_cb_t wait_cb, vo
pinfo->pi_last_checkpoint = pinfo->pi_last_inflight;
}

/*
* DAV allocator uses valgrind macros to mark certain portions of
* heap as no access for user. Prevent valgrind from reporting
* invalid read while checkpointing these address ranges.
*/
if (DAOS_ON_VALGRIND) {
d_sg_list_t *sgl = &chkpt_data->cd_sg_list;

for (i = 0; i < sgl->sg_nr; i++)
VALGRIND_DISABLE_ADDR_ERROR_REPORTING_IN_RANGE(
sgl->sg_iovs[i].iov_buf, sgl->sg_iovs[i].iov_len);
}

rc = store->stor_ops->so_flush_copy(chkpt_data->cd_fh,
&chkpt_data->cd_sg_list);
/** If this fails, it means invalid argument, so assertion here is fine */
D_ASSERT(rc == 0);

if (DAOS_ON_VALGRIND) {
d_sg_list_t *sgl = &chkpt_data->cd_sg_list;

for (i = 0; i < sgl->sg_nr; i++)
VALGRIND_ENABLE_ADDR_ERROR_REPORTING_IN_RANGE(
sgl->sg_iovs[i].iov_buf, sgl->sg_iovs[i].iov_len);
}

for (i = 0; i < chkpt_data->cd_nr_pages; i++) {
pinfo = chkpt_data->cd_pages[i];
pinfo->pi_copying = 0;
Expand Down Expand Up @@ -2205,6 +2228,13 @@ umem_cache_checkpoint(struct umem_store *store, umem_cache_wait_cb_t wait_cb, vo
iovs_used += chkpt_data->cd_sg_list.sg_nr_out;
d_list_add(&chkpt_data->cd_link, &free_list);

if (DAOS_FAIL_CHECK(DAOS_MEM_FAIL_CHECKPOINT) &&
pages_scanned >= nr_copying_pgs / 2) {
d_list_move(&cache->ca_pgs_copying, &cache->ca_pgs_dirty);
rc = -DER_AGAIN;
break;
}

} while (inflight != 0 || !d_list_empty(&cache->ca_pgs_copying));

D_FREE(chkpt_data_all);
Expand All @@ -2216,6 +2246,6 @@ umem_cache_checkpoint(struct umem_store *store, umem_cache_wait_cb_t wait_cb, vo
stats->uccs_nr_iovs = iovs_used;
}

return 0;
return rc;
}
#endif
10 changes: 7 additions & 3 deletions src/include/daos/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -849,16 +849,15 @@ enum {
#define DAOS_POOL_FAIL_MAP_REFRESH (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x69)
#define DAOS_CONT_G2L_FAIL (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x6a)
#define DAOS_POOL_CREATE_FAIL_STEP_UP (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x6b)
#define DAOS_CONT_OP_NOREPLY (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x6c)
#define DAOS_CONT_OP_NOREPLY (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x6c)

/** interoperability failure inject */
#define FLC_SMD_DF_VER (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x70)
#define FLC_POOL_DF_VER (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x71)
#define DAOS_FAIL_LOST_REQ (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x72)
#define DAOS_POOL_UPGRADE_CONT_ABORT (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x73)

#define DAOS_POOL_FAIL_MAP_REFRESH_SERIOUSLY \
(DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x73)
(DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x74)

#define DAOS_SHARD_OBJ_RW_DROP_REPLY (DAOS_FAIL_SYS_TEST_GROUP_LOC | 0x80)
#define DAOS_OBJ_FETCH_DATA_LOST (DAOS_FAIL_SYS_TEST_GROUP_LOC | 0x81)
Expand All @@ -883,6 +882,11 @@ enum {
#define DAOS_FORCE_OBJ_UPGRADE (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x9e)
#define DAOS_OBJ_FAIL_NVME_IO (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x9f)

/* WAL and checkpoint failure injection */
#define DAOS_WAL_NO_REPLAY (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x100)
#define DAOS_WAL_FAIL_REPLAY (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x101)
#define DAOS_MEM_FAIL_CHECKPOINT (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x102)

#define DAOS_DTX_SKIP_PREPARE DAOS_DTX_SPEC_LEADER

#define DAOS_FAIL_CHECK(id) daos_fail_check(id)
Expand Down
8 changes: 8 additions & 0 deletions src/include/gurt/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,14 @@
(void)(addr);\
(void)(len);\
} while (0)
/*
 * No-op stand-in for VALGRIND_DISABLE_ADDR_ERROR_REPORTING_IN_RANGE.
 * Each argument is evaluated exactly once and its value is discarded.
 * NOTE(review): presumably used when the real Valgrind client-request
 * headers are not available -- confirm against the enclosing #if.
 */
#define VALGRIND_DISABLE_ADDR_ERROR_REPORTING_IN_RANGE(addr, len) \
	do { \
		(void)(addr); \
		(void)(len); \
	} while (0)
/*
 * No-op stand-in for VALGRIND_ENABLE_ADDR_ERROR_REPORTING_IN_RANGE.
 * Each argument is evaluated exactly once and its value is discarded.
 * NOTE(review): presumably used when the real Valgrind client-request
 * headers are not available -- confirm against the enclosing #if.
 */
#define VALGRIND_ENABLE_ADDR_ERROR_REPORTING_IN_RANGE(addr, len) \
	do { \
		(void)(addr); \
		(void)(len); \
	} while (0)
#endif

#include <gurt/types.h>
Expand Down
4 changes: 4 additions & 0 deletions src/vos/tests/vts_io.h
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,10 @@ struct io_test_args {
int akey_size;
int dkey_size;
int co_create_step;
bool checkpoint;
bool no_replay;
bool fail_replay;
bool fail_checkpoint;
};

/** test counters */
Expand Down
Loading

0 comments on commit 51105db

Please sign in to comment.