Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[BUG]An io-error or disk-full failure occurs on the cluster master node, and the cluster cannot switch to the master node. #2296

Open
Benxiaohai001 opened this issue Sep 4, 2024 · 0 comments
Assignees

Comments

@Benxiaohai001
Copy link
Member

Describe the bug

When an io-error or disk-full fault occurs on the cluster's master node, the cluster fails to switch over to a new master node.
version:cnosdb 2.4.2, revision c6a1777
mode: 3m3d

To Reproduce

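"""
3+2 cluster (k8s deploy)
Create the database: CREATE DATABASE {db_name} WITH replica 3;
Start a thread that writes one row to the tskv-0 node every 0.5 seconds for one minute;
10 seconds after writing starts, inject the chaos-mesh fault.
Wait for the writer thread to finish.
Test points
  Check the cluster status
  Repeatedly compare the recorded data with the query results to verify they are consistent
"""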

import threading
import subprocess
import sys
import json
import time
import os

sys.path.append(os.getcwd())
from tools import modify_yaml
import uuid

# Positional command-line arguments 1..6, in this exact order.
#   namespace - substituted for "cnosdb-cluster" in the fault YAML (see ChaosOnWrite.run)
#   tskv0_ip  - host that receives the writes, SQL requests and health pings
#   tskv1_ip  - second host that is pinged during the health check
#   meta*_ip  - not referenced in the visible part of this script
# A missing argument raises IndexError, exactly as a direct sys.argv[i] lookup would.
(
    namespace,
    tskv0_ip,
    tskv1_ip,
    meta0_ip,
    meta1_ip,
    meta2_ip,
) = [sys.argv[position] for position in range(1, 7)]


class ChaosOnWrite:
    """Inject one chaos-mesh fault while rows are being written, then verify the data.

    The case creates a 3-replica database, writes rows from a background
    thread, applies the fault manifest with ``kubectl`` ten seconds into the
    write phase, waits for both tskv nodes to answer the ping endpoint and
    finally compares the query result with the rows whose write returned
    HTTP 200.

    Attributes:
        fault_type: Fault name as listed in ``fault_types`` (e.g. ``"io-error"``).
        expected_result: Rows acknowledged with HTTP 200, in write order.
        db_name: ``fault_type`` with ``-`` replaced by ``_``.
        fault_config_path: ``./breakdown/<db_name>.yaml``, the manifest passed
            to ``kubectl apply``.
    """

    def __init__(self, fault_type) -> None:
        """Derive the database name and manifest path from ``fault_type``."""
        self.fault_type = fault_type
        self.expected_result = []
        self.db_name = fault_type.replace("-", "_")
        self.fault_config_path = "./breakdown/" + fault_type.replace("-", "_") + ".yaml"

    def write_data(self):
        """Write 120 rows (``fa`` = 1..120) to tskv-0, pausing 0.5 s after each.

        Only rows whose request printed exactly ``200`` as its output are
        recorded in ``self.expected_result``; failed writes are skipped silently
        because failures are expected while the fault is active.
        """
        for count in range(1, 121):
            cmd = f"curl -s -XPOST -u 'root:' 'http://{tskv0_ip}:8902/api/v1/write?db={self.db_name}' -d 'ma,ta=a fa={count}' -w '%{{http_code}}'"
            result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
            http_code = result.stdout.strip()
            if http_code == '200':
                self.expected_result.append({"fa": float(count), "ta": "a"})
            time.sleep(0.5)

    def _execute_ddl(self, sql):
        """Run ``sql`` against the ``public`` database and require HTTP 200.

        Raises:
            AssertionError: if curl's output is anything other than ``200``.
        """
        cmd = f'curl -u "root:" -H "Accept: application/json" -XPOST "http://{tskv0_ip}:8902/api/v1/sql?db=public" -d "{sql}" -w "%{{http_code}}"'
        result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
        http_code = result.stdout.strip()
        assert http_code == '200', f"Request failed with HTTP status code:{http_code}, reson:{result.stderr}"

    @staticmethod
    def _is_healthy(ip):
        """Return True when the ping response of ``ip`` contains ``healthy``."""
        # NOTE(review): this is a substring test, so a body containing
        # "unhealthy" would also match - confirm the ping response format.
        ping = subprocess.run(
            f"curl -G 'http://{ip}:8902/api/v1/ping'", shell=True, capture_output=True, text=True
        )
        return "healthy" in ping.stdout

    @staticmethod
    def _parse_query_output(raw):
        """Decode the query output as JSON; return None when it is not valid JSON.

        A node that is still recovering may answer with an error text or with
        nothing at all, so undecodable output is treated as a case failure by
        the caller rather than being allowed to crash the whole script.
        """
        try:
            return json.loads(raw)
        except json.JSONDecodeError:
            return None

    def run_case(self):
        """Execute the five stages of the case and print the verdict.

        Failures of the health check or of the result comparison are reported
        with ``print`` and end the case early; they do not raise.

        Raises:
            AssertionError: if the database cannot be (re)created or
                ``kubectl apply`` does not report a created resource.
        """
        print(f"START core/common/inject_fault_on_write: {self.fault_type}")

        print("STAGE1: create db")
        self._execute_ddl(f"drop database if exists {self.db_name}")
        self._execute_ddl(f"create database {self.db_name} with replica 3")

        print("STAGE2: start writing to db")
        t_write_data = threading.Thread(target=self.write_data)
        t_write_data.start()
        # Let some writes succeed before the fault is injected.
        time.sleep(10)

        print(f"STAGE3: inject fault: {self.fault_type}")
        cmd = f"kubectl apply -f {self.fault_config_path}"
        result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
        assert result.returncode == 0, f"failed start {self.fault_type}, error is {result.stderr}"
        assert "created" in result.stdout

        print("STAGE4: wait for writing to complete")
        t_write_data.join()

        print("STAGE5: result check")
        # Poll up to 30 times, 5 s apart; the second node is only pinged when
        # the first one already reports healthy.
        healthy = False
        for _ in range(30):
            if all(self._is_healthy(ip) for ip in (tskv0_ip, tskv1_ip)):
                healthy = True
                break
            time.sleep(5)
        if not healthy:
            print(f"case inject_fault_on_write: {self.fault_type} failed, waiting too long for all node to be healthy")
            return

        cmd = f'curl -u "root:" -H "Accept: application/json" -XPOST "http://{tskv0_ip}:8902/api/v1/sql?db={self.db_name}" -d "select fa,ta from ma order by fa"'
        # The same query is repeated 100 times; every answer must match.
        for _ in range(100):
            raw_output = subprocess.run(cmd, shell=True, capture_output=True, text=True).stdout
            actual_result = self._parse_query_output(raw_output)
            if actual_result is None:
                print(
                    f"case inject_fault_on_write: {self.fault_type} failed, query output is not a JSON result: {raw_output!r}"
                )
                return
            if actual_result != self.expected_result:
                print(
                    f"case inject_fault_on_write: {self.fault_type} failed, inconsistent results, error result: {actual_result}"
                )
                return
        print(f"case inject_fault_on_write: {self.fault_type} success")

    def run(self):
        """Patch the fault manifest for this run, execute the case, then undo the patch.

        ``modify_yaml`` is called with (path, old, new); presumably it replaces
        ``old`` with ``new`` inside the file - confirm against ``tools.modify_yaml``.
        """
        modify_yaml(self.fault_config_path, "cnosdb-cluster", namespace)
        old_pod_name = f"name: {self.fault_type}"
        # A random suffix makes the resource name unique per run, presumably so
        # that ``kubectl apply`` reports "created" every time - TODO confirm.
        unique_string = str(uuid.uuid4())
        new_pod_name = f"name: {self.fault_type}-{unique_string}"
        modify_yaml(self.fault_config_path, old_pod_name, new_pod_name)
        try:
            self.run_case()
        finally:
            # Restore the manifest even when the case raised.
            modify_yaml(self.fault_config_path, namespace, "cnosdb-cluster")
            modify_yaml(self.fault_config_path, new_pod_name, old_pod_name)


# Fault scenarios, executed strictly in the order listed. Each name selects
# the manifest ./breakdown/<name with "-" replaced by "_">.yaml.
fault_types = [
    "container-kill", "network-loss", "disk-full", "io-error", "io-latency",
    "network-loss-meta", "container-kill-meta", "duplicated-packet-meta",
    "network-half-loss", "network-half-loss-meta",
    "network-latency", "network-latency-meta",
    "duplicated-packet", "corrupted-packet", "corrupted-packet-meta",
]

# Run the scenarios one after another; every scenario gets a fresh
# ChaosOnWrite instance, so its expected rows start out empty.
for fault_type in fault_types:
    test_case = ChaosOnWrite(fault_type)
    test_case.run()

https://github.com/cnosdb/integration_test/blob/main/cases/core/common/inject_fault_on_write.py

Expected behavior

No response

Additional context

No response

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants