Skip to content

Commit

Permalink
Added cluster lock
Browse files Browse the repository at this point in the history
  • Loading branch information
zhenrong-wang committed Jul 17, 2024
1 parent f1616c9 commit 225136c
Show file tree
Hide file tree
Showing 5 changed files with 89 additions and 1 deletion.
48 changes: 48 additions & 0 deletions hpcopr/cluster_general_funcs.c
Original file line number Diff line number Diff line change
Expand Up @@ -4611,6 +4611,54 @@ int volce_bucket_clean(char* workdir, char* crypto_keyfile){
return 0;
}

/*
 * Create (or overwrite) the cluster lock file "lock_timestamp.log" inside the
 * "log" subdirectory of workdir. The file holds the current time as a decimal
 * number of seconds on its first line, followed by INTERNAL_FILE_HEADER.
 *
 * workdir: working directory of the cluster.
 *
 * Return:  0 - the lock file has been written.
 *         -1 - create_and_get_subdir() failed for the "log" subdirectory.
 *         -3 - the lock file could not be opened for writing.
 *         -5 - writing or closing the lock file failed; the incomplete file
 *              is removed so that no unreadable lock is left behind.
 */
int create_cluster_lock(char* workdir){
    char lock_file[FILENAME_LENGTH]="";
    char logdir[DIR_LENGTH]="";
    if(create_and_get_subdir(workdir,"log",logdir,DIR_LENGTH)!=0){
        return -1;
    }
    snprintf(lock_file,FILENAME_LENGTH,"%s%slock_timestamp.log",logdir,PATH_SLASH);
    FILE* file_p=fopen(lock_file,"w+");
    if(file_p==NULL){
        return -3;
    }
    time_t current_time=time(NULL);
    /*
     * The width of time_t differs between platforms and ABIs, so it is
     * converted to long long and printed with %lld everywhere instead of
     * selecting a format specifier by operating system.
     */
    int write_flag=fprintf(file_p,"%lld\n%s\n",(long long)current_time,INTERNAL_FILE_HEADER);
    /* fclose() flushes the buffer, so its result is part of the write status. */
    int close_flag=fclose(file_p);
    if(write_flag<0||close_flag!=0){
        remove(lock_file);
        return -5;
    }
    return 0;
}

/*
 * Check whether the cluster is still locked by the timestamp stored in
 * "lock_timestamp.log" inside the "log" subdirectory of workdir.
 *
 * workdir: working directory of the cluster.
 *
 * Return: >0 - the cluster is locked; the value is the number of seconds
 *              left, never more than CLUSTER_LOCK_SECS.
 *          0 - not locked: the lock file does not exist, has expired, or
 *              does not start with a valid timestamp. In the latter two
 *              cases the lock file is removed.
 *         -1 - create_and_get_subdir() failed for the "log" subdirectory.
 */
ssize_t check_cluster_lock(char* workdir){
    char lock_file[FILENAME_LENGTH]="";
    char logdir[DIR_LENGTH]="";
    if(create_and_get_subdir(workdir,"log",logdir,DIR_LENGTH)!=0){
        return -1;
    }
    snprintf(lock_file,FILENAME_LENGTH,"%s%slock_timestamp.log",logdir,PATH_SLASH);
    FILE* file_p=fopen(lock_file,"r");
    if(file_p==NULL){
        return 0; /* No lock file: the cluster is not locked. */
    }
    /*
     * Read the timestamp as long long so that one format specifier is
     * correct on every platform, whatever the width of time_t is.
     */
    long long init_stamp=0;
    int parsed_items=fscanf(file_p,"%lld",&init_stamp);
    fclose(file_p);
    /*
     * An empty or corrupt file (nothing parsed) or a negative timestamp
     * cannot describe a valid lock. Drop it instead of computing with an
     * indeterminate value.
     */
    if(parsed_items!=1||init_stamp<0){
        rm_file_or_dir(lock_file);
        return 0;
    }
    long long elapsed=(long long)time(NULL)-init_stamp;
    if(elapsed>=CLUSTER_LOCK_SECS){
        rm_file_or_dir(lock_file); /* The lock has expired. */
        return 0;
    }
    /*
     * A timestamp in the future (e.g. the clock was set back) keeps the
     * cluster locked, but the reported time is capped at the full period.
     */
    if(elapsed<0){
        elapsed=0;
    }
    return (ssize_t)(CLUSTER_LOCK_SECS-elapsed);
}

/* return 1 - running; return 0 - stopped */
/*
int check_volce_ecs_state(char* node_name, char* stackdir){
Expand Down
4 changes: 4 additions & 0 deletions hpcopr/cluster_general_funcs.h
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,10 @@ int valid_region_zone_or_not(char* cluster_name, char* region, char* zone);
int get_default_zone(char* cluster_name, char* region, char* default_zone);
int get_default_nzone(char* cluster_name, char* region, char* default_zone, unsigned int zone_len_max);
int valid_zone_or_not(char* cluster_name, char* zone);

/* Write the current time to lock_timestamp.log in the "log" subdirectory of
 * workdir. Returns 0 on success and a negative value on failure. */
int create_cluster_lock(char* workdir);
/* Compare the time stored in lock_timestamp.log with the current time.
 * Returns the remaining lock time in seconds (>0) while the cluster is locked,
 * 0 when there is no lock file or the lock is older than CLUSTER_LOCK_SECS,
 * and -1 when the "log" subdirectory cannot be obtained. */
ssize_t check_cluster_lock(char* workdir);

/*int check_volce_ecs_state(char* node_name, char* stackdir);
int generate_volce_ecs_state(char* node_name, char* stackdir, int target_state);*/

Expand Down
8 changes: 8 additions & 0 deletions hpcopr/cluster_init.c
Original file line number Diff line number Diff line change
Expand Up @@ -1348,6 +1348,7 @@ int aws_cluster_init(char* workdir, char* crypto_keyfile, int batch_flag_local,
print_cluster_init_done();
create_local_tf_config(tf_run,stackdir);
delete_decrypted_files(workdir,crypto_keyfile);
create_cluster_lock(workdir); /* Create a lock file. */
return 0;
}

Expand Down Expand Up @@ -1642,6 +1643,7 @@ int qcloud_cluster_init(char* workdir, char* crypto_keyfile, int batch_flag_loca
print_cluster_init_done();
create_local_tf_config(tf_run,stackdir);
delete_decrypted_files(workdir,crypto_keyfile);
create_cluster_lock(workdir); /* Create a lock file. */
return 0;
}

Expand Down Expand Up @@ -1932,6 +1934,7 @@ int alicloud_cluster_init(char* workdir, char* crypto_keyfile, int batch_flag_lo
print_cluster_init_done();
create_local_tf_config(tf_run,stackdir);
delete_decrypted_files(workdir,crypto_keyfile);
create_cluster_lock(workdir); /* Create a lock file. */
return 0;
}

Expand Down Expand Up @@ -2251,6 +2254,7 @@ int hwcloud_cluster_init(char* workdir, char* crypto_keyfile, int batch_flag_loc
print_cluster_init_done();
create_local_tf_config(tf_run,stackdir);
delete_decrypted_files(workdir,crypto_keyfile);
create_cluster_lock(workdir); /* Create a lock file. */
return 0;
}

Expand Down Expand Up @@ -2560,6 +2564,7 @@ int baiducloud_cluster_init(char* workdir, char* crypto_keyfile, int batch_flag_
remote_copy(workdir,crypto_keyfile,SSHKEY_DIR,filename_temp,"/hpc_data/cluster_data/.bucket_creds/config","root","put","",0);
bceconfig_convert(vaultdir,"delete","","","");
delete_decrypted_files(workdir,crypto_keyfile);
create_cluster_lock(workdir); /* Create a lock file. */
return 0;
}

Expand Down Expand Up @@ -2807,6 +2812,7 @@ int azure_cluster_init(char* workdir, char* crypto_keyfile, int batch_flag_local
print_cluster_init_done();
create_local_tf_config(tf_run,stackdir);
delete_decrypted_files(workdir,crypto_keyfile);
create_cluster_lock(workdir); /* Create a lock file. */
return 0;
}

Expand Down Expand Up @@ -3080,6 +3086,7 @@ int gcp_cluster_init(char* workdir, char* crypto_keyfile, int batch_flag_local,
create_local_tf_config(tf_run,stackdir);
delete_decrypted_files(workdir,crypto_keyfile);
gcp_credential_convert(workdir,"delete",0);
create_cluster_lock(workdir); /* Create a lock file. */
return 0;
}

Expand Down Expand Up @@ -3358,5 +3365,6 @@ int volce_cluster_init(char* workdir, char* crypto_keyfile, int batch_flag_local
print_cluster_init_done();
create_local_tf_config(tf_run,stackdir);
delete_decrypted_files(workdir,crypto_keyfile);
create_cluster_lock(workdir); /* Create a lock file. */
return 0;
}
26 changes: 26 additions & 0 deletions hpcopr/cluster_operations.c
Original file line number Diff line number Diff line change
Expand Up @@ -1278,6 +1278,7 @@ int add_compute_node(char* workdir, char* crypto_keyfile, char* add_number_strin
}
printf(GENERAL_BOLD "[ -DONE- ]" RESET_DISPLAY " Congrats! The specified compute nodes have been added.\n");
delete_decrypted_files(workdir,crypto_keyfile);
create_cluster_lock(workdir);
return 0;
}

Expand All @@ -1301,6 +1302,12 @@ int shutdown_compute_nodes(char* workdir, char* crypto_keyfile, char* param, int
if(strcmp(cloud_flag,"CLOUD_A")!=0&&strcmp(cloud_flag,"CLOUD_B")!=0&&strcmp(cloud_flag,"CLOUD_C")!=0&&strcmp(cloud_flag,"CLOUD_D")!=0&&strcmp(cloud_flag,"CLOUD_E")!=0&&strcmp(cloud_flag,"CLOUD_G")!=0&&strcmp(cloud_flag,"CLOUD_H")!=0){
return -1;
}
ssize_t lock_sec=check_cluster_lock(workdir);
if(lock_sec>0){
printf(FATAL_RED_BOLD "\n[ FATAL: ] The cluster is temporarily locked for this operation.\n");
printf("[ **** ] Please wait for %d seconds." RESET_DISPLAY "\n",(int)lock_sec);
return 1;
}
decrypt_files(workdir,crypto_keyfile);
getstate(workdir,crypto_keyfile);
delete_decrypted_files(workdir,crypto_keyfile);
Expand Down Expand Up @@ -1564,6 +1571,12 @@ int reconfigure_compute_node(char* workdir, char* crypto_keyfile, char* new_conf
printf(WARN_YELLO_BOLD "[ -WARN- ] Currently there is no compute nodes in your cluster." RESET_DISPLAY "\n");
return -3;
}
ssize_t lock_sec=check_cluster_lock(workdir);
if(lock_sec>0){
printf(FATAL_RED_BOLD "\n[ FATAL: ] The cluster is temporarily locked for this operation.\n");
printf("[ **** ] Please wait for %d seconds." RESET_DISPLAY "\n",(int)lock_sec);
return -3;
}
decrypt_files(workdir,crypto_keyfile);
snprintf(filename_temp,FILENAME_LENGTH-1,"%s%shpc_stack_base.tf",stackdir,PATH_SLASH);
snprintf(string_temp,63,"\"%s\"",new_config);
Expand Down Expand Up @@ -1731,6 +1744,12 @@ int reconfigure_master_node(char* workdir, char* crypto_keyfile, char* new_confi
if(get_cloud_flag(workdir,crypto_keyfile,cloud_flag,16)!=0){
return -5;
}
ssize_t lock_sec=check_cluster_lock(workdir);
if(lock_sec>0){
printf(FATAL_RED_BOLD "\n[ FATAL: ] The cluster is temporarily locked for this operation.\n");
printf("[ **** ] Please wait for %d seconds." RESET_DISPLAY "\n",(int)lock_sec);
return 1;
}
decrypt_files(workdir,crypto_keyfile);
snprintf(filename_temp,FILENAME_LENGTH-1,"%s%shpc_stack_base.tf",stackdir,PATH_SLASH);
snprintf(string_temp,63,"\"%s\"",new_config);
Expand Down Expand Up @@ -1888,6 +1907,12 @@ int cluster_sleep(char* workdir, char* crypto_keyfile, tf_exec_config* tf_run){
printf("[ **** ] Command: hpcopr wakeup --all | --min ." RESET_DISPLAY "\n");
return 1;
}
ssize_t lock_sec=check_cluster_lock(workdir);
if(lock_sec>0){
printf(FATAL_RED_BOLD "\n[ FATAL: ] The cluster is temporarily locked for this operation.\n");
printf("[ **** ] Please wait for %d seconds." RESET_DISPLAY "\n",(int)lock_sec);
return 1;
}
decrypt_files(workdir,crypto_keyfile);
getstate(workdir,crypto_keyfile);
compute_node_num=get_compute_node_num(stackdir,crypto_keyfile,"all");
Expand Down Expand Up @@ -2453,6 +2478,7 @@ int rebuild_nodes(char* workdir, char* crypto_keyfile, char* option, int batch_f
printf(WARN_YELLO_BOLD "[ -INFO- ] The rebuild process may need 7 minutes. Please do not operate\n");
printf("[ **** ] this cluster during the period." RESET_DISPLAY "\n");
delete_decrypted_files(workdir,crypto_keyfile);
create_cluster_lock(workdir);
return 0;
}

Expand Down
4 changes: 3 additions & 1 deletion hpcopr/now_macros.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
#ifndef NOW_MACROS_H
#define NOW_MACROS_H

#define CORE_VERSION_CODE "0.3.1.0141"
#define CORE_VERSION_CODE "0.3.1.0143"

#define NULL_PTR_ARG -127

Expand All @@ -22,6 +22,8 @@
#define GENERAL_BOLD "\033[1m"
#define RESET_DISPLAY "\033[0m"

/* Duration of the cluster lock in seconds (420 s = 7 min). check_cluster_lock()
 * treats a lock timestamp at least this old as expired. */
#define CLUSTER_LOCK_SECS 420

/* Define the tf configuration */
typedef struct{
char tf_runner_type[16];
Expand Down

0 comments on commit 225136c

Please sign in to comment.