From ae8e43d4a37c11068f77c28cba38050bacae2fd8 Mon Sep 17 00:00:00 2001 From: Zhiguo Wu Date: Mon, 15 Jul 2024 10:40:58 +0800 Subject: [PATCH] BIGTOP-4158: Support service add job retry when failed (#17) --- .../agent/service/CommandServiceGrpcImpl.java | 20 ++++++++ .../agent/service/TaskLogServiceGrpcImpl.java | 22 +-------- .../manager/agent/utils/LogFileUtils.java | 42 ++++++++++++++++ .../server/controller/JobController.java | 7 +++ .../server/enums/ApiExceptionEnum.java | 1 + .../manager/server/enums/LocaleKeys.java | 1 + .../manager/server/service/JobService.java | 2 + .../server/service/impl/JobServiceImpl.java | 41 ++++++++++++++++ .../resources/i18n/messages_en_US.properties | 1 + .../resources/i18n/messages_zh_CN.properties | 1 + bigtop-manager-ui/src/api/job/index.ts | 7 +++ .../src/components/service-add/install.vue | 48 +++++++++++++++++-- bigtop-manager-ui/src/locales/en_US/common.ts | 1 + bigtop-manager-ui/src/locales/zh_CN/common.ts | 1 + 14 files changed, 170 insertions(+), 25 deletions(-) create mode 100644 bigtop-manager-agent/src/main/java/org/apache/bigtop/manager/agent/utils/LogFileUtils.java diff --git a/bigtop-manager-agent/src/main/java/org/apache/bigtop/manager/agent/service/CommandServiceGrpcImpl.java b/bigtop-manager-agent/src/main/java/org/apache/bigtop/manager/agent/service/CommandServiceGrpcImpl.java index ed1f829f..c9d93683 100644 --- a/bigtop-manager-agent/src/main/java/org/apache/bigtop/manager/agent/service/CommandServiceGrpcImpl.java +++ b/bigtop-manager-agent/src/main/java/org/apache/bigtop/manager/agent/service/CommandServiceGrpcImpl.java @@ -21,6 +21,7 @@ import org.apache.bigtop.manager.agent.cache.Caches; import org.apache.bigtop.manager.agent.executor.CommandExecutor; import org.apache.bigtop.manager.agent.executor.CommandExecutors; +import org.apache.bigtop.manager.agent.utils.LogFileUtils; import org.apache.bigtop.manager.grpc.generated.CommandReply; import org.apache.bigtop.manager.grpc.generated.CommandRequest; import org.apache.bigtop.manager.grpc.generated.CommandServiceGrpc; @@ -32,6 +33,10 @@ import lombok.extern.slf4j.Slf4j; import net.devh.boot.grpc.server.service.GrpcService; +import java.io.File; +import java.io.IOException; +import java.io.RandomAccessFile; + @Slf4j @GrpcService public class CommandServiceGrpcImpl extends CommandServiceGrpc.CommandServiceImplBase { @@ -39,6 +44,9 @@ public class CommandServiceGrpcImpl extends CommandServiceGrpc.CommandServiceImp @Override public void exec(CommandRequest request, StreamObserver responseObserver) { try { + // Truncate old logs if exists, only useful when it's retry command + truncateLogFile(request.getTaskId()); + MDC.put("taskId", String.valueOf(request.getTaskId())); Caches.RUNNING_TASKS.add(request.getTaskId()); CommandExecutor commandExecutor = CommandExecutors.getCommandExecutor(request.getType()); @@ -54,4 +62,16 @@ public void exec(CommandRequest request, StreamObserver responseOb MDC.clear(); } } + + private void truncateLogFile(Long taskId) { + String filePath = LogFileUtils.getLogFilePath(taskId); + File file = new File(filePath); + if (file.exists()) { + try (RandomAccessFile rf = new RandomAccessFile(file, "rw")) { + rf.setLength(0); + } catch (IOException e) { + log.warn("Error when truncate file: {}", filePath, e); + } + } + } } diff --git a/bigtop-manager-agent/src/main/java/org/apache/bigtop/manager/agent/service/TaskLogServiceGrpcImpl.java b/bigtop-manager-agent/src/main/java/org/apache/bigtop/manager/agent/service/TaskLogServiceGrpcImpl.java index 018ad0ad..781cc3fd 100644 --- a/bigtop-manager-agent/src/main/java/org/apache/bigtop/manager/agent/service/TaskLogServiceGrpcImpl.java +++ b/bigtop-manager-agent/src/main/java/org/apache/bigtop/manager/agent/service/TaskLogServiceGrpcImpl.java @@ -19,18 +19,16 @@ package org.apache.bigtop.manager.agent.service; import org.apache.bigtop.manager.agent.cache.Caches; +import org.apache.bigtop.manager.agent.utils.LogFileUtils; import org.apache.bigtop.manager.grpc.generated.TaskLogReply; import org.apache.bigtop.manager.grpc.generated.TaskLogRequest; import org.apache.bigtop.manager.grpc.generated.TaskLogServiceGrpc; -import org.apache.commons.lang3.SystemUtils; - import io.grpc.Status; import io.grpc.stub.StreamObserver; import lombok.extern.slf4j.Slf4j; import net.devh.boot.grpc.server.service.GrpcService; -import java.io.File; import java.io.RandomAccessFile; import java.nio.charset.StandardCharsets; @@ -40,7 +38,7 @@ public class TaskLogServiceGrpcImpl extends TaskLogServiceGrpc.TaskLogServiceImp @Override public void getLog(TaskLogRequest request, StreamObserver responseObserver) { - String path = getLogFilePath(request.getTaskId()); + String path = LogFileUtils.getLogFilePath(request.getTaskId()); try (RandomAccessFile file = new RandomAccessFile(path, "r")) { // Read from beginning long fileLength = file.length(); @@ -86,20 +84,4 @@ private void readNewLogs(RandomAccessFile file, StreamObserver res } } } - - private String getLogFilePath(Long taskId) { - String baseDir; - if (SystemUtils.IS_OS_WINDOWS) { - baseDir = SystemUtils.getUserDir().getPath(); - } else { - File file = new File(this.getClass() - .getProtectionDomain() - .getCodeSource() - .getLocation() - .getPath()); - baseDir = file.getParentFile().getParentFile().getPath(); - } - - return baseDir + File.separator + "tasklogs" + File.separator + "task-" + taskId + ".log"; - } } diff --git a/bigtop-manager-agent/src/main/java/org/apache/bigtop/manager/agent/utils/LogFileUtils.java b/bigtop-manager-agent/src/main/java/org/apache/bigtop/manager/agent/utils/LogFileUtils.java new file mode 100644 index 00000000..e87929a4 --- /dev/null +++ b/bigtop-manager-agent/src/main/java/org/apache/bigtop/manager/agent/utils/LogFileUtils.java @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.bigtop.manager.agent.utils; + +import org.apache.commons.lang3.SystemUtils; + +import java.io.File; + +public class LogFileUtils { + + public static String getLogFilePath(Long taskId) { + String baseDir; + if (SystemUtils.IS_OS_WINDOWS) { + baseDir = SystemUtils.getUserDir().getPath(); + } else { + File file = new File(LogFileUtils.class + .getProtectionDomain() + .getCodeSource() + .getLocation() + .getPath()); + baseDir = file.getParentFile().getParentFile().getPath(); + } + + return baseDir + File.separator + "tasklogs" + File.separator + "task-" + taskId + ".log"; + } +} diff --git a/bigtop-manager-server/src/main/java/org/apache/bigtop/manager/server/controller/JobController.java b/bigtop-manager-server/src/main/java/org/apache/bigtop/manager/server/controller/JobController.java index 940afd31..97bcc09a 100644 --- a/bigtop-manager-server/src/main/java/org/apache/bigtop/manager/server/controller/JobController.java +++ b/bigtop-manager-server/src/main/java/org/apache/bigtop/manager/server/controller/JobController.java @@ -25,6 +25,7 @@ import org.springframework.web.bind.annotation.GetMapping; import org.springframework.web.bind.annotation.PathVariable; +import org.springframework.web.bind.annotation.PostMapping; import org.springframework.web.bind.annotation.RequestMapping; import org.springframework.web.bind.annotation.RestController; @@ -66,4 +67,10 @@ public ResponseEntity> list(@PathVariable Long clusterId) { public ResponseEntity get(@PathVariable Long id, @PathVariable Long clusterId) { return ResponseEntity.success(jobService.get(id)); } + + @Operation(summary = "retry", description = "Retry a failed job") + @PostMapping("/{id}/retry") + public ResponseEntity retry(@PathVariable Long id, @PathVariable Long clusterId) { + return ResponseEntity.success(jobService.retry(id)); + } } diff --git a/bigtop-manager-server/src/main/java/org/apache/bigtop/manager/server/enums/ApiExceptionEnum.java b/bigtop-manager-server/src/main/java/org/apache/bigtop/manager/server/enums/ApiExceptionEnum.java index 97c8766a..45ea4235 100644 --- a/bigtop-manager-server/src/main/java/org/apache/bigtop/manager/server/enums/ApiExceptionEnum.java +++ b/bigtop-manager-server/src/main/java/org/apache/bigtop/manager/server/enums/ApiExceptionEnum.java @@ -52,6 +52,7 @@ public enum ApiExceptionEnum { // Job Exceptions -- 16000 ~ 16999 JOB_NOT_FOUND(16000, LocaleKeys.JOB_NOT_FOUND), + JOB_NOT_RETRYABLE(16001, LocaleKeys.JOB_NOT_RETRYABLE), // Configuration Exceptions -- 17000 ~ 17999 CONFIG_NOT_FOUND(17000, LocaleKeys.CONFIG_NOT_FOUND), diff --git a/bigtop-manager-server/src/main/java/org/apache/bigtop/manager/server/enums/LocaleKeys.java b/bigtop-manager-server/src/main/java/org/apache/bigtop/manager/server/enums/LocaleKeys.java index 2a9ad0f0..22dc64ae 100644 --- a/bigtop-manager-server/src/main/java/org/apache/bigtop/manager/server/enums/LocaleKeys.java +++ b/bigtop-manager-server/src/main/java/org/apache/bigtop/manager/server/enums/LocaleKeys.java @@ -50,6 +50,7 @@ public enum LocaleKeys { COMPONENT_NOT_FOUND("component.not.found"), JOB_NOT_FOUND("job.not.found"), + JOB_NOT_RETRYABLE("job.not.retryable"), CONFIG_NOT_FOUND("config.not.found"), diff --git a/bigtop-manager-server/src/main/java/org/apache/bigtop/manager/server/service/JobService.java b/bigtop-manager-server/src/main/java/org/apache/bigtop/manager/server/service/JobService.java index 117292fb..5d0e1d84 100644 --- a/bigtop-manager-server/src/main/java/org/apache/bigtop/manager/server/service/JobService.java +++ b/bigtop-manager-server/src/main/java/org/apache/bigtop/manager/server/service/JobService.java @@ -26,4 +26,6 @@ public interface JobService { PageVO list(Long clusterId); JobVO get(Long id); + + JobVO retry(Long id); } diff --git a/bigtop-manager-server/src/main/java/org/apache/bigtop/manager/server/service/impl/JobServiceImpl.java b/bigtop-manager-server/src/main/java/org/apache/bigtop/manager/server/service/impl/JobServiceImpl.java index b4391f7f..77e191a9 100644 --- a/bigtop-manager-server/src/main/java/org/apache/bigtop/manager/server/service/impl/JobServiceImpl.java +++ b/bigtop-manager-server/src/main/java/org/apache/bigtop/manager/server/service/impl/JobServiceImpl.java @@ -18,8 +18,16 @@ */ package org.apache.bigtop.manager.server.service.impl; +import org.apache.bigtop.manager.common.enums.JobState; import org.apache.bigtop.manager.dao.entity.Job; +import org.apache.bigtop.manager.dao.entity.Stage; +import org.apache.bigtop.manager.dao.entity.Task; import org.apache.bigtop.manager.dao.repository.JobRepository; +import org.apache.bigtop.manager.dao.repository.StageRepository; +import org.apache.bigtop.manager.dao.repository.TaskRepository; +import org.apache.bigtop.manager.server.command.scheduler.JobScheduler; +import org.apache.bigtop.manager.server.enums.ApiExceptionEnum; +import org.apache.bigtop.manager.server.exception.ApiException; import org.apache.bigtop.manager.server.model.mapper.JobMapper; import org.apache.bigtop.manager.server.model.query.PageQuery; import org.apache.bigtop.manager.server.model.vo.JobVO; @@ -41,6 +49,15 @@ public class JobServiceImpl implements JobService { @Resource private JobRepository jobRepository; + @Resource + private StageRepository stageRepository; + + @Resource + private TaskRepository taskRepository; + + @Resource + private JobScheduler jobScheduler; + @Override public PageVO list(Long clusterId) { PageQuery pageQuery = PageUtils.getPageQuery(); @@ -60,4 +77,28 @@ public JobVO get(Long id) { Job job = jobRepository.getReferenceById(id); return JobMapper.INSTANCE.fromEntity2VO(job); } + + @Override + public JobVO retry(Long id) { + Job job = jobRepository.getReferenceById(id); + if (job.getState() != JobState.FAILED) { + throw new ApiException(ApiExceptionEnum.JOB_NOT_RETRYABLE); + } + + for (Stage stage : job.getStages()) { + for (Task task : stage.getTasks()) { + task.setState(JobState.PENDING); + taskRepository.save(task); + } + + stage.setState(JobState.PENDING); + stageRepository.save(stage); + } + + job.setState(JobState.PENDING); + jobRepository.save(job); + jobScheduler.submit(job); + + return JobMapper.INSTANCE.fromEntity2VO(job); + } } diff --git a/bigtop-manager-server/src/main/resources/i18n/messages_en_US.properties b/bigtop-manager-server/src/main/resources/i18n/messages_en_US.properties index 6560c05a..5ae40e77 100644 --- a/bigtop-manager-server/src/main/resources/i18n/messages_en_US.properties +++ b/bigtop-manager-server/src/main/resources/i18n/messages_en_US.properties @@ -44,6 +44,7 @@ service.required.not.found=Required Service [{0}] not exist component.not.found=Component not exist job.not.found=Job not exist +job.not.retryable=Job is not retryable when it's not failed config.not.found=Config not exist diff --git a/bigtop-manager-server/src/main/resources/i18n/messages_zh_CN.properties b/bigtop-manager-server/src/main/resources/i18n/messages_zh_CN.properties index 60d9ed7c..a45ff716 100644 --- a/bigtop-manager-server/src/main/resources/i18n/messages_zh_CN.properties +++ b/bigtop-manager-server/src/main/resources/i18n/messages_zh_CN.properties @@ -44,6 +44,7 @@ service.required.not.found=依赖服务 [{0}] 不存在 component.not.found=组件不存在 job.not.found=任务不存在 +job.not.retryable=任务非失败状态,无法重试 config.not.found=配置不存在 diff --git a/bigtop-manager-ui/src/api/job/index.ts b/bigtop-manager-ui/src/api/job/index.ts index 869d7edb..d32ceb37 100644 --- a/bigtop-manager-ui/src/api/job/index.ts +++ b/bigtop-manager-ui/src/api/job/index.ts @@ -27,6 +27,13 @@ export const getJob = (id: number, clusterId: number): Promise => { }) } +export const retryJob = (id: number, clusterId: number): Promise => { + return request({ + method: 'post', + url: '/clusters/' + clusterId + '/jobs/' + id + '/retry' + }) +} + export const getJobs = ( clusterId: number, pagination: Pagination diff --git a/bigtop-manager-ui/src/components/service-add/install.vue b/bigtop-manager-ui/src/components/service-add/install.vue index 99c98494..cfc7cf62 100644 --- a/bigtop-manager-ui/src/components/service-add/install.vue +++ b/bigtop-manager-ui/src/components/service-add/install.vue @@ -19,15 +19,16 @@