[BUG]An io-error or disk-full failure occurs on the cluster master node, and the cluster cannot switch to the master node. #2296

Benxiaohai001 · 2024-09-04T10:54:59Z

Describe the bug

After an io-error or disk-full fault occurs on the cluster's master node, the cluster fails to switch over to a new master node.
version: cnosdb 2.4.2, revision c6a1777
mode: 3m3d

To Reproduce

"""
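3+2 cluster (k8s deploy)
Create the database: CREATE DATABASE {db_name} WITH replica 3;
Start a thread that writes one row to the tskv-0 node every 0.5 seconds for one minute;
10 seconds after writing starts, inject the chaos-mesh fault.
Wait for the writer thread to finish.
Checkpoints
  Check the cluster status
  Repeatedly compare the expected (text) data with the query results for consistency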
"""

import threading
import subprocess
import sys
import json
import time
import os

sys.path.append(os.getcwd())
from tools import modify_yaml
import uuid

# Positional command-line arguments, read in the order the script expects.
# A missing argument raises IndexError at import time.
namespace = sys.argv[1]  # substituted for "cnosdb-cluster" in the fault YAML by ChaosOnWrite.run
tskv0_ip = sys.argv[2]  # node that receives the writes, the SQL requests and the first health ping
tskv1_ip = sys.argv[3]  # second node pinged during the health check
meta0_ip = sys.argv[4]  # meta*_ip values are not referenced in the visible part of this script
meta1_ip = sys.argv[5]
meta2_ip = sys.argv[6]


class ChaosOnWrite:
    """Write to a replicated database while a chaos-mesh fault is injected.

    One instance drives one fault type: it creates a database with three
    replicas, writes rows from a background thread, applies the fault manifest
    part-way through, and finally checks that both data nodes report healthy
    and that queries return exactly the rows whose writes were acknowledged.

    Relies on the module-level ``namespace``, ``tskv0_ip`` and ``tskv1_ip``
    values and on ``modify_yaml`` imported from ``tools``.
    """

    def __init__(self, fault_type) -> None:
        """Derive the database name and manifest path from ``fault_type``.

        Args:
            fault_type: Fault name such as ``"disk-full"``; hyphens become
                underscores for the database name and the manifest file name.
        """
        self.fault_type = fault_type
        # Rows whose write was acknowledged with HTTP 200, in write order.
        self.expected_result = []
        self.db_name = fault_type.replace("-", "_")
        self.fault_config_path = "./breakdown/" + self.db_name + ".yaml"

    @staticmethod
    def _run_shell(cmd):
        """Run ``cmd`` through the shell and return the completed process with captured text output."""
        return subprocess.run(cmd, shell=True, capture_output=True, text=True)

    @staticmethod
    def _decode_query_output(raw):
        """Return ``raw`` parsed as JSON, or ``raw`` itself when it is not valid JSON.

        A node under fault may answer with an empty or non-JSON body; returning
        the raw text lets the caller report a mismatch instead of crashing.
        """
        try:
            return json.loads(raw)
        except json.JSONDecodeError:
            return raw

    def _execute_ddl(self, statement):
        """Send ``statement`` to the SQL endpoint of tskv-0 and require HTTP 200.

        Raises:
            AssertionError: If curl's reported status code is not ``200``.
        """
        cmd = f'curl -u "root:" -H "Accept: application/json" -XPOST "http://{tskv0_ip}:8902/api/v1/sql?db=public" -d "{statement}" -w "%{{http_code}}"'
        result = self._run_shell(cmd)
        http_code = result.stdout.strip()
        # Raised explicitly (rather than via ``assert``) so the check survives ``python -O``.
        if http_code != '200':
            raise AssertionError(f"Request failed with HTTP status code:{http_code}, reson:{result.stderr}")

    def _inject_fault(self):
        """Apply the fault manifest with kubectl.

        Raises:
            AssertionError: If kubectl exits non-zero or does not report
                that a resource was created.
        """
        # NOTE(review): the experiment is never deleted here; presumably the
        # manifest bounds its own duration — confirm.
        result = self._run_shell(f"kubectl apply -f {self.fault_config_path}")
        if result.returncode != 0:
            raise AssertionError(f"failed start {self.fault_type}, error is {result.stderr}")
        if "created" not in result.stdout:
            raise AssertionError(f"kubectl did not report a created resource: {result.stdout!r}")

    def _node_is_healthy(self, node_ip):
        """Return True when the ping endpoint of ``node_ip`` answers with text containing "healthy"."""
        return "healthy" in self._run_shell(f"curl -G 'http://{node_ip}:8902/api/v1/ping'").stdout

    def _wait_until_healthy(self):
        """Poll both data nodes up to 30 times, 5 seconds apart; return whether both became healthy."""
        for _ in range(30):
            # all() short-circuits, so tskv-1 is only pinged once tskv-0 is healthy.
            if all(self._node_is_healthy(ip) for ip in (tskv0_ip, tskv1_ip)):
                return True
            time.sleep(5)
        return False

    def write_data(self):
        """Write 120 rows to tskv-0, one every 0.5 seconds.

        Only rows acknowledged with HTTP 200 are appended to
        ``self.expected_result``; rejected or failed writes are skipped.
        """
        for count in range(1, 121):
            cmd = f"curl -s -XPOST -u 'root:' 'http://{tskv0_ip}:8902/api/v1/write?db={self.db_name}' -d 'ma,ta=a fa={count}' -w '%{{http_code}}'"
            result = self._run_shell(cmd)
            if result.stdout.strip() == '200':
                self.expected_result.append({"fa": float(count), "ta": "a"})
            time.sleep(0.5)

    def run_case(self):
        """Execute the scenario and print whether it succeeded.

        Raises:
            AssertionError: If database setup or fault injection fails.
        """
        print(f"START core/common/inject_fault_on_write: {self.fault_type}")

        print("STAGE1: create db")
        self._execute_ddl(f"drop database if exists {self.db_name}")
        self._execute_ddl(f"create database {self.db_name} with replica 3")

        print("STAGE2: start writing to db")
        t_write_data = threading.Thread(target=self.write_data)
        t_write_data.start()
        try:
            # Let some writes land before the fault is injected.
            time.sleep(10)

            print(f"STAGE3: inject fault: {self.fault_type}")
            self._inject_fault()

            print("STAGE4: wait for writing to complete")
        finally:
            # Always join, so a failed injection does not leave the writer
            # running in the background while the exception propagates.
            t_write_data.join()

        print("STAGE5: result check")
        if not self._wait_until_healthy():
            print(f"case inject_fault_on_write: {self.fault_type} failed, waiting too long for all node to be healthy")
            return

        cmd = f'curl -u "root:" -H "Accept: application/json" -XPOST "http://{tskv0_ip}:8902/api/v1/sql?db={self.db_name}" -d "select fa,ta from ma order by fa"'
        # Query repeatedly; every response must match the acknowledged rows.
        for _ in range(100):
            actual_result = self._decode_query_output(self._run_shell(cmd).stdout)
            if actual_result != self.expected_result:
                print(
                    f"case inject_fault_on_write: {self.fault_type} failed, inconsistent results, error result: {actual_result}"
                )
                return
        print(f"case inject_fault_on_write: {self.fault_type} success")

    def run(self):
        """Patch the fault manifest for this run, execute the case, then restore the manifest.

        The manifest is rewritten through ``modify_yaml`` (presumably a
        path/old-text/new-text substitution — verify against ``tools``) to
        target ``namespace`` and to carry a unique resource name, and both
        substitutions are reverted even when the case raises.
        """
        modify_yaml(self.fault_config_path, "cnosdb-cluster", namespace)
        old_pod_name = f"name: {self.fault_type}"
        # A UUID suffix keeps the name distinct from one run to the next.
        unique_string = str(uuid.uuid4())
        new_pod_name = f"name: {self.fault_type}-{unique_string}"
        modify_yaml(self.fault_config_path, old_pod_name, new_pod_name)
        try:
            self.run_case()
        finally:
            modify_yaml(self.fault_config_path, namespace, "cnosdb-cluster")
            modify_yaml(self.fault_config_path, new_pod_name, old_pod_name)

# Fault scenarios to exercise, in execution order. Each name selects the
# manifest ./breakdown/<name with "-" replaced by "_">.yaml and a database of
# the same underscored name (see ChaosOnWrite.__init__).
fault_types = [
    "container-kill",
    "network-loss",
    "disk-full",
    "io-error",
    "io-latency",
    "network-loss-meta",
    "container-kill-meta",
    "duplicated-packet-meta",
    "network-half-loss",
    "network-half-loss-meta",
    "network-latency",
    "network-latency-meta",
    "duplicated-packet",
    "corrupted-packet",
    "corrupted-packet-meta",
]

# Run the scenarios one after another; an exception raised by one scenario
# propagates out of the loop and prevents the remaining ones from running.
for fault_type in fault_types:
    test_case = ChaosOnWrite(fault_type)
    test_case.run()

https://github.com/cnosdb/integration_test/blob/main/cases/core/common/inject_fault_on_write.py

Expected behavior

No response

Additional context

No response

The text was updated successfully, but these errors were encountered:

roseboy-liu assigned bartliu827 Nov 7, 2024

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[BUG]An io-error or disk-full failure occurs on the cluster master node, and the cluster cannot switch to the master node. #2296

[BUG]An io-error or disk-full failure occurs on the cluster master node, and the cluster cannot switch to the master node. #2296

Benxiaohai001 commented Sep 4, 2024

[BUG]An io-error or disk-full failure occurs on the cluster master node, and the cluster cannot switch to the master node. #2296

[BUG]An io-error or disk-full failure occurs on the cluster master node, and the cluster cannot switch to the master node. #2296

Comments

Benxiaohai001 commented Sep 4, 2024

Describe the bug

To Reproduce

Expected behavior

Additional context