containers · edsantiago · Jul 18, 2024 · Jul 29, 2024 · Jul 30, 2024 · Aug 4, 2024
diff --git a/.cirrus.yml b/.cirrus.yml
@@ -224,6 +224,7 @@ alt_build_task:
     only_if: &no_rhel_release |
         $CIRRUS_BRANCH !=~ 'v[0-9\.]+-rhel' &&
         $CIRRUS_BASE_BRANCH !=~ 'v[0-9\.]+-rhel'
+    skip: $CI == $CI
     env:
         <<: *stdenvars
         TEST_FLAVOR: "altbuild"
@@ -261,6 +262,7 @@ osx_alt_build_task:
     alias: osx_alt_build
     # Docs: ./contrib/cirrus/CIModes.md
     only_if: *no_rhel_release  # RHEL never releases podman mac installer binary
+    skip: $CI == $CI
     persistent_worker: &mac_pw
         labels:
             os: darwin
@@ -441,6 +443,7 @@ win_installer_task:
             CONTAINERS_MACHINE_PROVIDER: 'hyperv'
     alias: win_installer
     only_if: *no_rhel_release
+    skip: $CI == $CI
     depends_on: *build
     ec2_instance: &windows
         image: "${WINDOWS_AMI}"
@@ -627,6 +630,7 @@ local_integration_test_task: &local_integration_test_task
         changesInclude('.cirrus.yml', 'Makefile', 'contrib/cirrus/**', 'vendor/**', 'test/tools/**', 'test/registries*.conf', 'hack/**', 'version/rawversion/*') ||
         changesInclude('test/e2e/**', 'test/utils/**') ||
         (changesInclude('**/*.go', '**/*.c', '**/*.h') && !changesIncludeOnly('test/**', 'pkg/machine/e2e/**'))
+    skip: $CI == $CI
     depends_on: *build
     matrix: *platform_axis
     # integration tests scale well with cpu as they are parallelized
@@ -650,6 +654,7 @@ local_integration_test_task: &local_integration_test_task
 remote_integration_test_task:
     <<: *local_integration_test_task
     alias: remote_integration_test
+    skip: $CI == $CI
     env:
         TEST_FLAVOR: int
         PODBIN_NAME: remote
@@ -662,6 +667,7 @@ container_integration_test_task:
     alias: container_integration_test
     # Docs: ./contrib/cirrus/CIModes.md
     only_if: *only_if_int_test
+    skip: $CI == $CI
     depends_on: *build
     matrix: &fedora_vm_axis
         - env:
@@ -689,6 +695,7 @@ rootless_integration_test_task:
     alias: rootless_integration_test
     # Docs: ./contrib/cirrus/CIModes.md
     only_if: *only_if_int_test
+    skip: $CI == $CI
     depends_on: *build
     matrix: *platform_axis
     gce_instance: *fastvm
@@ -712,6 +719,7 @@ podman_machine_task:
         $CIRRUS_CHANGE_TITLE =~ '.*CI:ALL.*' ||
         changesInclude('.cirrus.yml', 'Makefile', 'contrib/cirrus/**', 'vendor/**', 'test/tools/**', 'test/registries*.conf', 'hack/**', 'version/rawversion/*') ||
         changesInclude('cmd/podman/machine/**', 'pkg/machine/**', '**/*machine*.go')
+    skip: $CI == $CI
     depends_on: *build
     ec2_instance:
         image: "${VM_IMAGE_NAME}"
@@ -734,6 +742,7 @@ podman_machine_aarch64_task:
     name: *std_name_fmt
     alias: podman_machine_aarch64
     only_if: *only_if_machine_test
+    skip: $CI == $CI
     depends_on: *build
     ec2_instance:
         <<: *standard_build_ec2_aarch64
@@ -764,9 +773,10 @@ podman_machine_windows_task:
     # everywhere to do so here it would mean we would need duplicate the
     # full big only_if condition which is more difficult to maintain so
     # use the skip here.
-    skip: &skip_rhel_release |
-        $CIRRUS_BRANCH =~ 'v[0-9\.]+-rhel' ||
-        $CIRRUS_BASE_BRANCH =~ 'v[0-9\.]+-rhel'
+#    skip: &skip_rhel_release |
+#        $CIRRUS_BRANCH =~ 'v[0-9\.]+-rhel' ||
+#        $CIRRUS_BASE_BRANCH =~ 'v[0-9\.]+-rhel'
+    skip: $CI == $CI
     depends_on: *build
     ec2_instance:
         <<: *windows
@@ -792,7 +802,8 @@ podman_machine_mac_task:
     name: *std_name_fmt
     alias: podman_machine_mac
     only_if: *only_if_machine_test
-    skip: *skip_rhel_release
+#    skip: *skip_rhel_release
+    skip: $CI == $CI
     depends_on: *build
     persistent_worker: *mac_pw
     timeout_in: 35m
@@ -967,6 +978,7 @@ buildah_bud_test_task:
         $CIRRUS_CHANGE_TITLE =~ '.*CI:ALL.*' ||
         changesInclude('.cirrus.yml', 'Makefile', 'contrib/cirrus/**', 'vendor/**', 'test/tools/**', 'test/registries*.conf', 'hack/**', 'version/rawversion/*') ||
         changesInclude('**/*build*.go', 'test/buildah-bud/**')
+    skip: $CI == $CI
     depends_on: *build
     env:
         <<: *stdenvars
@@ -996,6 +1008,7 @@ upgrade_test_task:
         changesInclude('.cirrus.yml', 'Makefile', 'contrib/cirrus/**', 'vendor/**', 'test/tools/**', 'test/registries*.conf', 'hack/**', 'version/rawversion/*') ||
         changesInclude('test/upgrade/**', 'test/system/*.bash') ||
         (changesInclude('**/*.go', '**/*.c', '**/*.h') && !changesIncludeOnly('test/**', 'pkg/machine/e2e/**'))
+    skip: $CI == $CI
     depends_on: *build
     matrix:
         - env:

diff --git a/go.mod b/go.mod
@@ -230,3 +230,5 @@ require (
 	gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 // indirect
 	tags.cncf.io/container-device-interface/specs-go v0.8.0 // indirect
 )
+
+replace github.com/nxadm/tail => github.com/Luap99/tail v0.0.0-20240626140224-ad4e60e8be25
diff --git a/go.sum b/go.sum
@@ -10,6 +10,8 @@ github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161/go.mod h1:xomTg6
 github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
 github.com/BurntSushi/toml v1.4.0 h1:kuoIxZQy2WRRk1pttg9asf+WVv6tWQuBNVmK8+nqPr0=
 github.com/BurntSushi/toml v1.4.0/go.mod h1:ukJfTF/6rtPPRCnwkur4qwRxa8vTRFBF0uk2lLoLwho=
+github.com/Luap99/tail v0.0.0-20240626140224-ad4e60e8be25 h1:fz7HD7A+DFIBortMJp4kCr0WqU5FXjQHPkXPMTHOsrw=
+github.com/Luap99/tail v0.0.0-20240626140224-ad4e60e8be25/go.mod h1:OTaG3NK980DZzxbRq6lEuzgU+mug70nY11sMd4JXXHc=
 github.com/Microsoft/go-winio v0.6.2 h1:F2VQgta7ecxGYO8k3ZZz3RS8fVIXVxONVUPlNERoyfY=
 github.com/Microsoft/go-winio v0.6.2/go.mod h1:yd8OoFMLzJbo9gZq8j5qaps8bJ9aShtEA8Ipt1oGCvU=
 github.com/Microsoft/hcsshim v0.12.9 h1:2zJy5KA+l0loz1HzEGqyNnjd3fyZA31ZBCGKacp6lLg=
@@ -386,8 +388,6 @@ github.com/morikuni/aec v1.0.0 h1:nP9CBfwrvYnBRgY6qfDQkygYDmYwOilePFkwzv4dU8A=
 github.com/morikuni/aec v1.0.0/go.mod h1:BbKIizmSmc5MMPqRYbxO4ZU0S0+P200+tUnFx7PXmsc=
 github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA=
 github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ=
-github.com/nxadm/tail v1.4.11 h1:8feyoE3OzPrcshW5/MJ4sGESc5cqmGkGCWlco4l0bqY=
-github.com/nxadm/tail v1.4.11/go.mod h1:OTaG3NK980DZzxbRq6lEuzgU+mug70nY11sMd4JXXHc=
 github.com/oklog/ulid v1.3.1 h1:EGfNDEx6MqHz8B3uNV6QAib1UR2Lm97sHi3ocA6ESJ4=
 github.com/oklog/ulid v1.3.1/go.mod h1:CirwcVhetQ6Lv90oh/F+FBtV6XMibvdAFo93nm5qn4U=
 github.com/onsi/ginkgo/v2 v2.22.0 h1:Yed107/8DjTr0lKCNt7Dn8yQ6ybuDRQoMGrNFKzMfHg=

diff --git a/hack/bats b/hack/bats
@@ -131,6 +131,9 @@ export PODMAN_ROOTLESS_USER=$(id -un)
 # Make sure to always check for leaks when running locally
 export PODMAN_BATS_LEAK_CHECK=1
 
+# FIXME: debugging aid: gather per-test output under a scratch directory
+export BATS_LOGDIR=$(mktemp -d --tmpdir podman-bats-logs.XXXXXXXXX)
+
 # Root
 if [[ "$TEST_ROOT" ]]; then
     echo "# bats ${bats_opts[*]} ${bats_filter[*]} $TESTS"
@@ -140,15 +143,18 @@ if [[ "$TEST_ROOT" ]]; then
             --preserve-env=OCI_RUNTIME \
             --preserve-env=CONTAINERS_HELPER_BINARY_DIR \
             --preserve-env=PODMAN_ROOTLESS_USER \
-            bats "${bats_opts[@]}" "${bats_filter[@]}" $TESTS
+            --preserve-env=BATS_LOGDIR \
+            bats "${bats_opts[@]}" "${bats_filter[@]}" \
+                 --gather-test-outputs-in "$BATS_LOGDIR/root" $TESTS
     rc=$?
 fi
 
 # Rootless. (Only if we're not already root)
 if [[ "$TEST_ROOTLESS" && "$(id -u)" != 0 ]]; then
     echo "--------------------------------------------------"
     echo "\$ bats ${bats_opts[*]} ${bats_filter[*]} $TESTS"
-    bats "${bats_opts[@]}" "${bats_filter[@]}" $TESTS
+    bats "${bats_opts[@]}" "${bats_filter[@]}" \
+         --gather-test-outputs-in "$BATS_LOGDIR/rootless" $TESTS
     rc=$((rc | $?))
 fi
 

diff --git a/test/system/030-run.bats b/test/system/030-run.bats
@@ -863,6 +863,7 @@ json-file | f
     # exactly 10 seconds. Give it some leeway.
     delta_t=$(( $t1 - $t0 ))
     assert "$delta_t" -gt 1 "podman stop: ran too quickly!"
+    # FIXME: can fail under load, when 'podman stop' takes 7 seconds
     assert "$delta_t" -le 6 "podman stop: took too long"
 
     run_podman rm $cname
@@ -890,7 +891,7 @@ EOF
 
 # bats test_tags=ci:parallel
 @test "podman run --hostuser tests" {
-    skip_if_not_rootless "test whether hostuser is successfully added"
+    skip_if_not_rootless "--hostuser is only meaningful when rootless"
     user=$(id -un)
     run_podman 1 run --rm $IMAGE grep $user /etc/passwd
     run_podman run --hostuser=$user --rm $IMAGE grep $user /etc/passwd

diff --git a/test/system/035-logs.bats b/test/system/035-logs.bats
@@ -336,6 +336,14 @@ function _log_test_follow_since() {
     run_podman ${events_backend} run --log-driver=$driver --name $cname -d $IMAGE \
         sh -c "sleep 1; while :; do echo $content && sleep 1; done"
 
+    # FIXME FIXME: TEMPORARY debugging aid (original bug no longer remembered): save the k8s-file log path
+    logpath=
+    if [[ "$driver" = "k8s-file" ]]; then
+        run_podman inspect --format '{{.HostConfig.LogConfig.Path}}' $cname
+        logpath="$output"
+    fi
+    # FIXME FIXME
+
     # sleep is required to make sure the podman event backend no longer sees the start event in the log
     # This value must be greater or equal than the value given in --since below
     sleep 0.2

diff --git a/test/system/080-pause.bats b/test/system/080-pause.bats
@@ -48,6 +48,7 @@ load helpers
 
     # There should be a 3-4 second gap, *maybe* 5. Never 1 or 2, that
     # would imply that the container never paused.
+    # FIXME: under high load, the gap can be as long as 7 seconds
     is "$max_delta" "[3456]" "delta t between paused and restarted"
 
     run_podman rm -t 0 -f $cname

diff --git a/test/system/220-healthcheck.bats b/test/system/220-healthcheck.bats
@@ -85,6 +85,12 @@ Log[-1].ExitCode | 0
 Log[-1].Output   | \"Life is Good on stdout\\\nLife is Good on stderr\\\n\"
 " "$current_time" "healthy"
 
+    # FIXME FIXME FIXME: 20240918: there's a race here, wherein _check_health()
+    # can see a "healthy" that comes from before 'touch uh-oh'. One way to
+    # fix that might be to add another arg to _check_health, 'FailingStreak'.
+    # That doesn't show up in podman-events, though, so we'd have to
+    # run podman-inspect in a loop, and that introduces its own races.
+    # I don't have a good answer here. See log.103
     current_time=$(date --iso-8601=ns)
     # Force a failure
     run_podman exec $ctrname touch /uh-oh

diff --git a/test/system/250-systemd.bats b/test/system/250-systemd.bats
@@ -377,6 +377,7 @@ LISTEN_FDNAMES=listen_fdnames" | sort)
     run_podman exec $cname touch /uh-oh
 
     # healthcheck should now fail, with exit status 1 and 'unhealthy' output
+    # FIXME: race: under high load, we can get "Error: no container with ID xxxx"
     run_podman 1 healthcheck run $cname
     is "$output" "unhealthy" "output from 'podman healthcheck run'"
 

diff --git a/test/system/255-auto-update.bats b/test/system/255-auto-update.bats
@@ -18,14 +18,17 @@ function setup() {
 }
 
 function teardown() {
+    ls -l /run/netns | sed -e "s/^/# teardown /" >&3
     if [[ -e $SNAME_FILE ]]; then
         while read line; do
             if [[ "$line" =~ "podman-auto-update" ]]; then
                 echo "Stop timer: $line.timer"
                 systemctl stop $line.timer
                 systemctl disable $line.timer
             else
+                ls -l /run/netns | sed -e "s/^/# before stop $line /" >&3
                 systemctl stop $line
+                ls -l /run/netns | sed -e "s/^/# after stop $line /" >&3
             fi
             rm -f $UNIT_DIR/$line.{service,timer}
         done < $SNAME_FILE
@@ -66,12 +69,12 @@ function generate_service() {
 
     # Unless specified, set a default command.
     if [[ -z "$command" ]]; then
-        command="top -d 120"
+        command="top -d $((100 + BATS_SUITE_TEST_NUMBER))"
     fi
 
     # Container name. Include the autoupdate type, to make debugging easier.
     # IMPORTANT: variable 'cname' is passed (out of scope) up to caller!
-    cname=c_${autoupdate//\'/}_$(random_string)
+    cname="c-$(safename)-${autoupdate//\'/}-$(random_string)"
     target_img="quay.io/libpod/$target_img_basename:latest"
     if [[ -n "$7" ]]; then
         target_img="$7"
@@ -172,7 +175,7 @@ function _confirm_update() {
 
 # This test can fail in dev. environment because of SELinux.
 # quick fix: chcon -t container_runtime_exec_t ./bin/podman
-@test "podman auto-update - label io.containers.autoupdate=image" {
+@test "podman auto-update - label io.containers.autoupdate=imagexxxxxxx" {
     since=$(date --iso-8601=seconds)
     run_podman auto-update
     is "$output" ""
@@ -214,6 +217,11 @@ function _confirm_update() {
     run_podman container inspect --format "{{.ID}}" $ctr_child
     run_podman container inspect --format "{{.State.Status}}" $ctr_child
     is "$output" "running" "child container is in running state"
+
+    ls -l /run/netns | sed -e 's/^/# before container rm /' >&3
+    run_podman container rm -f -t0 $ctr_child
+    run_podman container rm -f -t0 $ctr_parent
+    ls -l /run/netns | sed -e 's/^/# after container rm /' >&3
 }
 
 @test "podman auto-update - label io.containers.autoupdate=image with rollback" {

diff --git a/test/system/260-sdnotify.bats b/test/system/260-sdnotify.bats
@@ -553,8 +553,15 @@ none | false | false | 0
     export NOTIFY_SOCKET=$PODMAN_TMPDIR/notify-$(safename).sock
     _start_socat
 
+    echo "ls -l $PODMAN_TMPDIR/"
+    ls -l $PODMAN_TMPDIR/
+
     run_podman push $registry_flags $IMAGE $image_on_local_registry
 
+    # List the directory again, this time after the push
+    echo "ls -l $PODMAN_TMPDIR/"
+    ls -l $PODMAN_TMPDIR/
+
     run_podman pull $registry_flags $image_on_local_registry
     is "${lines[1]}" "Pulling image //$image_on_local_registry inside systemd: setting pull timeout to 5m0s" "NOTIFY_SOCKET is passed to container"
 

diff --git a/test/system/331-system-check.bats b/test/system/331-system-check.bats
@@ -4,9 +4,21 @@
 # that they are caught and remedied, even if it requires discarding some
 # data in read-write layers.
 #
+# DO NOT PARALLELIZE. All of these tests require complete control of images.
+#
 
 load helpers
 
+function setup_file() {
+    # BATS per-file hook. Start pristine: remove all pods, containers, volumes, images
+    run_podman pod rm -a -f
+    run_podman rm -f -a -t0
+    run_podman volume rm -a
+    run_podman image rm -f -a
+
+    _prefetch $IMAGE    # presumably restores the test image removed above -- TODO confirm
+}
+
 @test "podman system check - unmanaged layers" {
     run_podman_testing create-storage-layer
     layerID="$output"

diff --git a/test/system/700-play.bats b/test/system/700-play.bats
@@ -158,7 +158,7 @@ RELABEL="system_u:object_r:container_file_t:s0"
     # Run `play kube` in the background as it will wait for the service
     # container to exit.
     timeout --foreground -v --kill=10 60 \
-        $PODMAN play kube --service-container=true --log-driver journald $TESTYAML &>/dev/null &
+        $PODMAN --syslog play kube --service-container=true --log-driver journald $TESTYAML &>/dev/null &
 
     # Wait for the container to be running
     container_a=$PODCTRNAME
@@ -200,7 +200,7 @@ RELABEL="system_u:object_r:container_file_t:s0"
     is "$output" "true"
 
     # Restart the pod, make sure the service is running again
-    run_podman pod restart $PODNAME
+    run_podman --syslog pod restart $PODNAME
     run_podman container inspect $service_container --format "{{.State.Running}}"
     is "$output" "true"
 
@@ -211,13 +211,13 @@ RELABEL="system_u:object_r:container_file_t:s0"
     is "$output" "Error: container .* is the service container of pod(s) .* and cannot be removed without removing the pod(s)"
 
     # Kill the pod and make sure the service is not running
-    run_podman pod kill $PODNAME
+    run_podman --syslog pod kill $PODNAME
     _ensure_container_running $service_container false
 
     run_podman network ls
 
     # Remove the pod and make sure the service is removed along with it
-    run_podman pod rm $PODNAME
+    run_podman --syslog pod rm $PODNAME
     run_podman 1 container exists $service_container
 }
 
@@ -693,6 +693,7 @@ spec:
     if [[ -n "$PARALLEL_JOBSLOT" ]]; then
         expect=$((expect + 4))
     fi
+    # FIXME: under high load, delta_t can be 12
     assert $delta_t -le $expect \
            "podman kube play did not get killed within $expect seconds"
     # Make sure we actually got SIGTERM and podman printed its message.

diff --git a/test/system/setup_suite.bash b/test/system/setup_suite.bash
@@ -45,6 +45,8 @@ function teardown_suite() {
     stop_registry
     local exit_code=$?
 
+    run_podman '?' rmi $(pause_image)
+
     # At end, if all tests have passed, check for leaks.
     # Don't do this if there were errors: failing tests may not clean up.
     if [[ -e "$BATS_SUITE_TMPDIR/all-tests-passed" ]]; then