Skip to content

Commit

Permalink
In /liveness, verify that Cassandra is still running (#548)
Browse files Browse the repository at this point in the history
* In /liveness, verify that Cassandra is still running (if it was started) by checking the PID file exists and the PID is still running

* If /start is called, check that pid is also running, not just present

* Add issue #552 to Changelog

---------

Co-authored-by: Erik Merkle <[email protected]>
  • Loading branch information
burmanm and emerkle826 authored Oct 23, 2024
1 parent ef1e0dd commit 25819d8
Show file tree
Hide file tree
Showing 7 changed files with 59 additions and 12 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ Changelog for Management API, new PRs should update the `main / unreleased` sect
```

## unreleased
* [ENHANCEMENT] [#552](https://github.com/k8ssandra/management-api-for-apache-cassandra/issues/552) Improve "liveness" probe implementation

## v0.1.87 (2024-10-02)
* [FEATURE] [#535](https://github.com/k8ssandra/management-api-for-apache-cassandra/issues/535) Add Cassandra 5.0.0 to the build matrix
Expand Down
2 changes: 1 addition & 1 deletion cassandra/Dockerfile-3.11.ubi8
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ RUN if test ! -e apache-cassandra-${CASSANDRA_VERSION}-bin.tar.gz; then curl -L
chown -R cassandra:root ${CASSANDRA_HOME} && \
chmod -R a+rwX ${CASSANDRA_HOME}

FROM cassandra-builder-${TARGETARCH} as cassandra-builder
FROM cassandra-builder-${TARGETARCH} AS cassandra-builder

#############################################################
# Build the Management API
Expand Down
2 changes: 1 addition & 1 deletion cassandra/Dockerfile-4.0.ubi8
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ RUN if test ! -e apache-cassandra-${CASSANDRA_VERSION}-bin.tar.gz; then curl -L
chown -R cassandra:root ${CASSANDRA_HOME} && \
chmod -R a+rwX ${CASSANDRA_HOME}

FROM cassandra-builder-${TARGETARCH} as cassandra-builder
FROM cassandra-builder-${TARGETARCH} AS cassandra-builder

#############################################################
# Build the Management API
Expand Down
2 changes: 1 addition & 1 deletion cassandra/Dockerfile-4.1.ubi8
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ RUN if test ! -e apache-cassandra-${CASSANDRA_VERSION}-bin.tar.gz; then curl -L
chown -R cassandra:root ${CASSANDRA_HOME} && \
chmod -R a+rwX ${CASSANDRA_HOME}

FROM cassandra-builder-${TARGETARCH} as cassandra-builder
FROM cassandra-builder-${TARGETARCH} AS cassandra-builder

#############################################################
# Build the Management API
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
import com.datastax.mgmtapi.util.ShellUtils;
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;
Expand Down Expand Up @@ -42,6 +44,25 @@ private static Optional<File> which(String exeStr) throws IOException {
(exitCode, err) -> Optional.empty());
}

public static final Optional<Integer> findPid(String socketFilePath) throws IOException {
java.nio.file.Path pidPath = Paths.get("/tmp/cassandra.pid");
if (pidPath.toFile().canRead()) {
List<String> lines = Files.readAllLines(pidPath);
if (!lines.isEmpty()) {
return Optional.of(Integer.parseInt(lines.get(0)));
}
}
return UnixCmds.findDbProcessWithMatchingArg("-Ddb.unix_socket_file=" + socketFilePath);
}

public static boolean isPidRunning(int pid) throws IOException {
ProcessBuilder psListPb = new ProcessBuilder(PS_CMD, "-eo", "pid");
return ShellUtils.executeWithHandlers(
psListPb,
(input, err) -> input.anyMatch(x -> x.contains(String.valueOf(pid))),
(exitCode, err) -> false);
}

public static Optional<Integer> findDbProcessWithMatchingArg(String filterStr)
throws IOException {

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,10 @@
*/
package com.datastax.mgmtapi.resources;

import static com.datastax.mgmtapi.ManagementApplication.STATE.STARTED;

import com.datastax.mgmtapi.ManagementApplication;
import com.datastax.mgmtapi.UnixCmds;
import com.datastax.mgmtapi.resources.common.BaseResources;
import com.datastax.mgmtapi.resources.helpers.ResponseTools;
import com.datastax.mgmtapi.resources.models.Job;
Expand All @@ -19,18 +22,24 @@
import io.swagger.v3.oas.annotations.media.ExampleObject;
import io.swagger.v3.oas.annotations.media.Schema;
import io.swagger.v3.oas.annotations.responses.ApiResponse;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import javax.ws.rs.GET;
import javax.ws.rs.POST;
import javax.ws.rs.Path;
import javax.ws.rs.Produces;
import javax.ws.rs.QueryParam;
import javax.ws.rs.core.MediaType;
import javax.ws.rs.core.Response;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

@Path("/api/v0")
public class K8OperatorResources extends BaseResources {
private static final Logger logger = LoggerFactory.getLogger(K8OperatorResources.class);

private static final ObjectMapper jsonMapper = new ObjectMapper();

public K8OperatorResources(ManagementApplication application) {
Expand All @@ -50,6 +59,19 @@ public K8OperatorResources(ManagementApplication application) {
schema = @Schema(implementation = String.class),
examples = @ExampleObject(value = "OK")))
public Response checkLiveness() {
if (app.getRequestedState() == STARTED) {
// Verify it is still running
try {
Optional<Integer> pid = UnixCmds.findPid(app.dbUnixSocketFile.getAbsolutePath());
if (pid.isPresent() && UnixCmds.isPidRunning(pid.get())) {
return Response.ok("OK").build();
}
} catch (IOException e) {
// NOOP
logger.error("Unable to read the pid file, " + e.getMessage(), e);
}
return Response.serverError().build();
}
return Response.ok("OK").build();
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,6 @@ public LifecycleResources(ManagementApplication app) {
examples = @ExampleObject(value = "error message")))
public synchronized Response startNode(
@QueryParam("profile") String profile, @QueryParam("replace_ip") String replaceIp) {
app.setRequestedState(STARTED);

// Todo we should add a CALL getPid command and compare;
boolean canConnect;
Expand All @@ -125,7 +124,7 @@ public synchronized Response startNode(
try {
Optional<Integer> maybePid = findPid();

if (maybePid.isPresent()) {
if (maybePid.isPresent() && UnixCmds.isPidRunning(maybePid.get())) {
return Response.status(canConnect ? HttpStatus.SC_ACCEPTED : HttpStatus.SC_NO_CONTENT)
.build();
} else if (canConnect) return Response.status(HttpStatus.SC_PARTIAL_CONTENT).build();
Expand Down Expand Up @@ -195,6 +194,8 @@ public synchronized Response startNode(
// DSE and HCD need the extra "cassandra" startup argument
dbCmdPb.command().add("cassandra");
}
dbCmdPb.command().add("-p");
dbCmdPb.command().add("/tmp/cassandra.pid");
dbCmdPb.command().add("-R");
dbCmdPb.command().add("-Dcassandra.server_process");
dbCmdPb.command().add("-Dcassandra.skip_default_role_setup=true");
Expand All @@ -220,8 +221,12 @@ public synchronized Response startNode(
environment);
}

if (started) logger.info("Started {}", getServerTypeName());
else logger.warn("Error starting {}", getServerTypeName());
if (started) {
logger.info("Started {}", getServerTypeName());
app.setRequestedState(STARTED);
} else {
logger.warn("Error starting {}", getServerTypeName());
}

return Response.status(started ? HttpStatus.SC_CREATED : HttpStatus.SC_METHOD_FAILURE)
.entity(started ? "OK\n" : String.format("Error starting %s", getServerTypeName()))
Expand Down Expand Up @@ -278,7 +283,7 @@ public synchronized Response stopNode() {
} while (tries-- > 0);

Optional<Integer> maybePid = findPid();
if (maybePid.isPresent()) {
if (maybePid.isPresent() && UnixCmds.isPidRunning(maybePid.get())) {
Boolean stopped = UnixCmds.killProcess(maybePid.get());

if (!stopped) {
Expand All @@ -291,7 +296,7 @@ public synchronized Response stopNode() {
Uninterruptibles.sleepUninterruptibly(sleepSeconds, TimeUnit.SECONDS);

maybePid = findPid();
if (maybePid.isPresent()) {
if (maybePid.isPresent() && UnixCmds.isPidRunning(maybePid.get())) {
logger.info("Cassandra is not able to die");
return Response.serverError()
.entity(Entity.text(String.format("Killing %s Failed", getServerTypeName())))
Expand Down Expand Up @@ -520,10 +525,8 @@ public Response getPID() {
}

private Optional<Integer> findPid() throws IOException {
return UnixCmds.findDbProcessWithMatchingArg(
"-Ddb.unix_socket_file=" + app.dbUnixSocketFile.getAbsolutePath());
return UnixCmds.findPid(app.dbUnixSocketFile.getAbsolutePath());
}

/**
* Verifies that the provided log directory can be written. Will attempt to create the directory
* if it does not exist.
Expand Down

0 comments on commit 25819d8

Please sign in to comment.