You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
When an io-error or disk-full fault occurs on the cluster's master node, the cluster cannot fail over to a new master node.
version: cnosdb 2.4.2, revision c6a1777
mode: 3m3d
To Reproduce
"""
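3+2 cluster (deployed on k8s).
Create the database: CREATE DATABASE {db_name} WITH replica 3;
Start a thread that writes one record to the tskv-0 node every 0.5 seconds for one minute;
10 seconds after writing starts, inject the chaos-mesh fault.
Wait for the writer thread to finish.
Test points
Check the cluster status.
Compare the text data with the query results several times and verify that they match.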
"""
import threading
import subprocess
import sys
import json
import time
import os
sys.path.append(os.getcwd())
from tools import modify_yaml
import uuid
# Positional command-line arguments; all six are required.
# Kubernetes namespace, substituted for "cnosdb-cluster" in the fault YAML.
namespace = sys.argv[1]
# Node that receives the write, SQL and ping requests on port 8902.
tskv0_ip = sys.argv[2]
# Second node pinged during the post-fault health check.
tskv1_ip = sys.argv[3]
# Meta node addresses; not referenced by the code visible in this file.
meta0_ip = sys.argv[4]
meta1_ip = sys.argv[5]
meta2_ip = sys.argv[6]
class ChaosOnWrite:
    """Write to a replica-3 database while a chaos-mesh fault is injected.

    One instance covers one fault type. ``run`` patches the fault YAML,
    executes the scenario through ``run_case`` and restores the YAML again.
    The scenario passes when both data nodes report healthy afterwards and
    the stored rows equal the rows whose writes were acknowledged with 200.
    """

    def __init__(self, fault_type) -> None:
        """Derive the database name and fault YAML path from *fault_type*.

        Args:
            fault_type: Fault name such as ``"io-error"``; dashes are replaced
                by underscores for the database name and the YAML file name.
        """
        self.fault_type = fault_type
        # Rows acknowledged by the server, in write order.
        self.expected_result = []
        self.db_name = fault_type.replace("-", "_")
        self.fault_config_path = "./breakdown/" + fault_type.replace("-", "_") + ".yaml"

    @staticmethod
    def _check(condition, message):
        """Raise ``AssertionError(message)`` unless *condition* holds.

        Used instead of ``assert`` so the checks survive ``python -O``.
        """
        if not condition:
            raise AssertionError(message)

    @staticmethod
    def _parse_rows(raw):
        """Decode a query response body; return ``None`` if it is not JSON."""
        try:
            return json.loads(raw)
        except json.JSONDecodeError:
            return None

    @staticmethod
    def _is_healthy(ip):
        """Return True when the ping endpoint of *ip* answers with "healthy"."""
        ping = subprocess.run(
            ["curl", "-G", f"http://{ip}:8902/api/v1/ping"],
            capture_output=True,
            text=True,
        )
        return "healthy" in ping.stdout

    def _post_sql(self, db, sql, with_status=False):
        """POST *sql* to the SQL endpoint of the first data node.

        Args:
            db: Database the statement runs against.
            sql: SQL text sent as the request body.
            with_status: Append the HTTP status code to stdout (curl ``-w``).

        Returns:
            The ``subprocess.CompletedProcess`` of the curl invocation.
        """
        cmd = [
            "curl",
            "-u",
            "root:",
            "-H",
            "Accept: application/json",
            "-XPOST",
            f"http://{tskv0_ip}:8902/api/v1/sql?db={db}",
            "-d",
            sql,
        ]
        if with_status:
            cmd += ["-w", "%{http_code}"]
        return subprocess.run(cmd, capture_output=True, text=True)

    def write_data(self):
        """Write 120 points, one every 0.5 s, recording the acknowledged ones.

        Only writes answered with HTTP 200 are appended to
        ``self.expected_result``; failed writes are skipped silently because
        failures are expected while the fault is active.
        """
        url = f"http://{tskv0_ip}:8902/api/v1/write?db={self.db_name}"
        for count in range(1, 121):
            result = subprocess.run(
                [
                    "curl",
                    "-s",
                    "-XPOST",
                    "-u",
                    "root:",
                    url,
                    "-d",
                    f"ma,ta=a fa={count}",
                    "-w",
                    "%{http_code}",
                ],
                capture_output=True,
                text=True,
            )
            if result.stdout.strip() == "200":
                self.expected_result.append({"fa": float(count), "ta": "a"})
            time.sleep(0.5)

    def run_case(self):
        """Run the scenario: create db, write, inject fault, verify.

        Prints the outcome instead of returning it.

        Raises:
            AssertionError: If the database cannot be (re)created or the
                fault cannot be applied with ``kubectl``.
        """
        print(f"START core/common/inject_fault_on_write: {self.fault_type}")

        print("STAGE1: create db")
        for sql in (
            f"drop database if exists {self.db_name}",
            f"create database {self.db_name} with replica 3",
        ):
            result = self._post_sql("public", sql, with_status=True)
            http_code = result.stdout.strip()
            self._check(
                http_code == "200",
                f"Request failed with HTTP status code:{http_code}, reason:{result.stderr}",
            )

        print("STAGE2: start writing to db")
        t_write_data = threading.Thread(target=self.write_data)
        t_write_data.start()
        # Let some writes succeed before the fault hits.
        time.sleep(10)

        print(f"STAGE3: inject fault: {self.fault_type}")
        # NOTE(review): the experiment is applied but never deleted here;
        # presumably the YAML limits its duration -- confirm.
        result = subprocess.run(
            ["kubectl", "apply", "-f", self.fault_config_path],
            capture_output=True,
            text=True,
        )
        self._check(
            result.returncode == 0,
            f"failed start {self.fault_type}, error is {result.stderr}",
        )
        self._check(
            "created" in result.stdout,
            f"kubectl did not report the fault as created: {result.stdout}",
        )

        print("STAGE4: wait for writing to complete")
        t_write_data.join()

        print("STAGE5: result check")
        # Poll up to 30 times, 5 s apart; the second node is only pinged
        # when the first one is healthy.
        for _ in range(30):
            if all(self._is_healthy(ip) for ip in (tskv0_ip, tskv1_ip)):
                break
            time.sleep(5)
        else:
            print(f"case inject_fault_on_write: {self.fault_type} failed, waiting too long for all node to be healthy")
            return

        # Repeat the query to catch results that differ between reads.
        for _ in range(100):
            raw = self._post_sql(self.db_name, "select fa,ta from ma order by fa").stdout
            actual_result = self._parse_rows(raw)
            if actual_result is None:
                # Empty or non-JSON output (e.g. curl could not connect) is a
                # failed case, not a reason to abort the remaining scenarios.
                print(
                    f"case inject_fault_on_write: {self.fault_type} failed, query returned no JSON result, raw output: {raw!r}"
                )
                return
            if actual_result != self.expected_result:
                print(
                    f"case inject_fault_on_write: {self.fault_type} failed, inconsistent results, error result: {actual_result}"
                )
                return
        print(f"case inject_fault_on_write: {self.fault_type} success")

    def run(self):
        """Patch the fault YAML for this run, execute the case, then restore it.

        ``modify_yaml`` is presumably a text substitution (path, old, new) --
        verify against ``tools``. The experiment gets a UUID suffix so that
        each run uses a distinct resource name.
        """
        modify_yaml(self.fault_config_path, "cnosdb-cluster", namespace)
        old_pod_name = f"name: {self.fault_type}"
        unique_string = str(uuid.uuid4())
        new_pod_name = f"name: {self.fault_type}-{unique_string}"
        modify_yaml(self.fault_config_path, old_pod_name, new_pod_name)
        try:
            self.run_case()
        finally:
            # Undo both substitutions even when the case raised.
            modify_yaml(self.fault_config_path, namespace, "cnosdb-cluster")
            modify_yaml(self.fault_config_path, new_pod_name, old_pod_name)
# Fault scenarios, executed one after another in this order. Each name maps
# to a database name and a YAML file under ./breakdown/ (dashes become
# underscores).
fault_types = [
    "container-kill",
    "network-loss",
    "disk-full",
    "io-error",
    "io-latency",
    "network-loss-meta",
    "container-kill-meta",
    "duplicated-packet-meta",
    "network-half-loss",
    "network-half-loss-meta",
    "network-latency",
    "network-latency-meta",
    "duplicated-packet",
    "corrupted-packet",
    "corrupted-packet-meta",
]

# An exception escaping one case stops the remaining ones.
for fault_type in fault_types:
    test_case = ChaosOnWrite(fault_type)
    test_case.run()
Describe the bug
When an io-error or disk-full fault occurs on the cluster's master node, the cluster cannot fail over to a new master node.
version: cnosdb 2.4.2, revision c6a1777
mode: 3m3d
To Reproduce
https://github.com/cnosdb/integration_test/blob/main/cases/core/common/inject_fault_on_write.py
Expected behavior
No response
Additional context
No response
The text was updated successfully, but these errors were encountered: