Skip to content
This repository has been archived by the owner on Oct 2, 2024. It is now read-only.

Commit

Permalink
PR #1808: block mknod(2) only if creating a device
Browse files Browse the repository at this point in the history
  • Loading branch information
reidpr authored Jan 18, 2024
1 parent ce46b16 commit 07f5a45
Show file tree
Hide file tree
Showing 9 changed files with 199 additions and 49 deletions.
137 changes: 95 additions & 42 deletions bin/ch_core.c
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,11 @@ struct bind BINDS_DEFAULT[] = {
{ 0 }
};

/* Special values for seccomp tables. These must be negative to avoid clashing
with real syscall numbers (note zero is often a valid syscal number). */
#define NR_NON -1 // syscall does not exist on architecture
#define NR_END -2 // end of table

/* Architectures that we support for seccomp. Order matches the
corresponding table below.
Expand All @@ -90,7 +95,7 @@ int SECCOMP_ARCHS[] = { AUDIT_ARCH_AARCH64, // arm64
AUDIT_ARCH_PPC64LE, // PPC
AUDIT_ARCH_S390X, // s390x
AUDIT_ARCH_X86_64, // x86-64
-1 };
NR_END };
#endif

/* System call numbers that we fake with seccomp (by doing nothing and
Expand All @@ -103,8 +108,6 @@ int SECCOMP_ARCHS[] = { AUDIT_ARCH_AARCH64, // arm64
automatically, so they are compiled from [1, 2, 3]. See also [4] for a more
general reference.
Zero means the syscall does not exist on that architecture.
NOTE: The total number of faked syscalls (i.e., non-zero entries below)
must be somewhat less than 256. I haven’t computed the exact limit. There
will be an assertion failure at runtime if this is exceeded.
Expand All @@ -120,36 +123,38 @@ int FAKE_SYSCALL_NRS[][6] = {
// arm64 arm32 x86 PPC64 s390x x86-64
// ------ ------ ------ ------ ------ ------
{ 91, 185, 185, 184, 185, 126 }, // capset
{ 0, 182, 182, 181, 212, 92 }, // chown
{ 0, 212, 212, 0, 0, 0 }, // chown32
{ NR_NON, 182, 182, 181, 212, 92 }, // chown
{ NR_NON, 212, 212, NR_NON, NR_NON, NR_NON }, // chown32
{ 55, 95, 95, 95, 207, 93 }, // fchown
{ 0, 207, 207, 0, 0, 0 }, // fchown32
{ NR_NON, 207, 207, NR_NON, NR_NON, NR_NON }, // fchown32
{ 54, 325, 298, 289, 291, 260 }, // fchownat
{ 0, 16, 16, 16, 198, 94 }, // lchown
{ 0, 198, 198, 0, 0, 0 }, // lchown32
{ NR_NON, 16, 16, 16, 198, 94 }, // lchown
{ NR_NON, 198, 198, NR_NON, NR_NON, NR_NON }, // lchown32
{ 104, 347, 283, 268, 277, 246 }, // kexec_load
{ 0, 14, 14, 14, 14, 133 }, // mknod
{ 33, 324, 297, 288, 290, 259 }, // mknodat
{ 152, 139, 139, 139, 216, 123 }, // setfsgid
{ 0, 216, 216, 0, 0, 0 }, // setfsgid32
{ NR_NON, 216, 216, NR_NON, NR_NON, NR_NON }, // setfsgid32
{ 151, 138, 138, 138, 215, 122 }, // setfsuid
{ 0, 215, 215, 0, 0, 0 }, // setfsuid32
{ NR_NON, 215, 215, NR_NON, NR_NON, NR_NON }, // setfsuid32
{ 144, 46, 46, 46, 214, 106 }, // setgid
{ 0, 214, 214, 0, 0, 0 }, // setgid32
{ NR_NON, 214, 214, NR_NON, NR_NON, NR_NON }, // setgid32
{ 159, 81, 81, 81, 206, 116 }, // setgroups
{ 0, 206, 206, 0, 0, 0 }, // setgroups32
{ NR_NON, 206, 206, NR_NON, NR_NON, NR_NON }, // setgroups32
{ 143, 71, 71, 71, 204, 114 }, // setregid
{ 0, 204, 204, 0, 0, 0 }, // setregid32
{ NR_NON, 204, 204, NR_NON, NR_NON, NR_NON }, // setregid32
{ 149, 170, 170, 169, 210, 119 }, // setresgid
{ 0, 210, 210, 0, 0, 0 }, // setresgid32
{ NR_NON, 210, 210, NR_NON, NR_NON, NR_NON }, // setresgid32
{ 147, 164, 164, 164, 208, 117 }, // setresuid
{ 0, 208, 208, 0, 0, 0 }, // setresuid32
{ NR_NON, 208, 208, NR_NON, NR_NON, NR_NON }, // setresuid32
{ 145, 70, 70, 70, 203, 113 }, // setreuid
{ 0, 203, 203, 0, 0, 0 }, // setreuid32
{ NR_NON, 203, 203, NR_NON, NR_NON, NR_NON }, // setreuid32
{ 146, 23, 23, 23, 213, 105 }, // setuid
{ 0, 213, 213, 0, 0, 0 }, // setuid32
{ -1 }, // end
{ NR_NON, 213, 213, NR_NON, NR_NON, NR_NON }, // setuid32
{ NR_END }, // end
};
int FAKE_MKNOD_NRS[] =
{ NR_NON, 14, 14, 14, 14, 133 };
int FAKE_MKNODAT_NRS[] =
{ 33, 324, 297, 288, 290, 259 };
#endif


Expand Down Expand Up @@ -557,40 +562,53 @@ void seccomp_install(void)
int arch_ct = sizeof(SECCOMP_ARCHS)/sizeof(SECCOMP_ARCHS[0]) - 1;
int syscall_cts[arch_ct];
struct sock_fprog p = { 0 };
int ii, idx_allow, idx_fake, idx_next_arch;

// Count how many syscalls we are going to fake. We need this to compute
// the right offsets for all the jumps.
for (int ai = 0; SECCOMP_ARCHS[ai] != -1; ai++) {
p.len += 4; // arch test, end-of-arch jump, load arch & syscall nr
int ii, idx_allow, idx_fake, idx_mknod, idx_mknodat, idx_next_arch;
// Lengths of certain instruction groups. These are all obtained manually
// by counting below, violating DRY. We could automate these counts, but it
// seemed like the cost of extra buffers and code to do that would exceed
// that of maintaining the manual counts.
int ct_jump_start = 4; // ld arch & syscall nr, arch test, end-of-arch jump
int ct_mknod_jump = 2; // jump table handling for mknod(2) and mknodat(2)
int ct_mknod = 2; // mknod(2) handling
int ct_mknodat = 6; // mknodat(2) handling

// Count how many syscalls we are going to fake in the standard way. We
// need this to compute the right offsets for all the jumps.
for (int ai = 0; SECCOMP_ARCHS[ai] != NR_END; ai++) {
p.len += ct_jump_start + ct_mknod_jump;
syscall_cts[ai] = 0;
for (int si = 0; FAKE_SYSCALL_NRS[si][0] != -1; si++) {
bool syscall_p = FAKE_SYSCALL_NRS[si][ai] > 0;
for (int si = 0; FAKE_SYSCALL_NRS[si][0] != NR_END; si++) {
bool syscall_p = FAKE_SYSCALL_NRS[si][ai] != NR_NON;
syscall_cts[ai] += syscall_p;
p.len += syscall_p; // syscall jump table entry
}
DEBUG("seccomp: %x: %d", SECCOMP_ARCHS[ai], syscall_cts[ai]);
DEBUG("seccomp: arch %x: found %d syscalls",
SECCOMP_ARCHS[ai], syscall_cts[ai]);
}

// Initialize program buffer.
p.len += 2; // return instructions (allow and fake success)
p.len += ( 1 // return allow
+ 1 // return fake success
+ ct_mknod // mknod(2) handling
+ ct_mknodat); // mknodat(2) handling
DEBUG("seccomp(2) program has %d instructions", p.len);
T_ (p.len <= 258); // avoid jumps > 255
T_ (p.filter = calloc(p.len, sizeof(struct sock_filter)));

// Return call addresses. Allow needs to come first because we’ll jump to
// it for unknown architectures.
idx_allow = p.len - 2;
idx_fake = p.len - 1;
idx_allow = p.len - 2 - ct_mknod - ct_mknodat;
idx_fake = p.len - 1 - ct_mknod - ct_mknodat;
idx_mknod = p.len - ct_mknod - ct_mknodat;
idx_mknodat = p.len - ct_mknodat;

// Build a jump table for each architecture. The gist is: if architecture
// matches, fall through into the jump table, otherwise jump to the next
// architecture (or ALLOW for the last architecture).
ii = 0;
idx_next_arch = -1; // avoid warning on some compilers
for (int ai = 0; SECCOMP_ARCHS[ai] != -1; ai++) {
for (int ai = 0; SECCOMP_ARCHS[ai] != NR_END; ai++) {
int jump;
idx_next_arch = ii + syscall_cts[ai] + 4;
idx_next_arch = ii + syscall_cts[ai] + ct_jump_start + ct_mknod_jump;
// load arch into accumulator
iw(&p, ii++, BPF_LD|BPF_W|BPF_ABS,
offsetof(struct seccomp_data, arch), 0, 0);
Expand All @@ -602,30 +620,65 @@ void seccomp_install(void)
iw(&p, ii++, BPF_LD|BPF_W|BPF_ABS,
offsetof(struct seccomp_data, nr), 0, 0);
// jump table of syscalls
for (int si = 0; FAKE_SYSCALL_NRS[si][0] != -1; si++) {
for (int si = 0; FAKE_SYSCALL_NRS[si][0] != NR_END; si++) {
int nr = FAKE_SYSCALL_NRS[si][ai];
if (nr > 0) {
if (nr != NR_NON) {
jump = idx_fake - ii - 1;
T_ (jump <= 255);
iw(&p, ii++, BPF_JMP|BPF_JEQ|BPF_K, nr, jump, 0);
}
}
// jump to allow (distance limit of 255 does not apply to JA)
iw(&p, ii, BPF_JMP|BPF_JA, idx_allow - ii - 1, 0, 0);
ii++;
// jump to mknod(2) handling (add even if syscall not implemented to
// make the instruction counts simpler)
jump = idx_mknod - ii - 1;
T_ (jump <= 255);
iw(&p, ii++, BPF_JMP|BPF_JEQ|BPF_K, FAKE_MKNOD_NRS[ai], jump, 0);
// jump to mknodat(2) handling
jump = idx_mknodat - ii - 1;
T_ (jump <= 255);
iw(&p, ii++, BPF_JMP|BPF_JEQ|BPF_K, FAKE_MKNODAT_NRS[ai], jump, 0);
// unfiltered syscall, jump to allow (limit of 255 doesn’t apply to JA)
jump = idx_allow - ii - 1;
iw(&p, ii++, BPF_JMP|BPF_JA, jump, 0, 0);
}
T_ (idx_next_arch == idx_allow);

// Returns. (Note that if we wanted a non-zero errno, we’d bitwise-or with
// SECCOMP_RET_ERRNO. But because fake success is errno == 0, we don’t need
// a no-op “| 0”.)
iw(&p, idx_allow, BPF_RET|BPF_K, SECCOMP_RET_ALLOW, 0, 0);
iw(&p, idx_fake, BPF_RET|BPF_K, SECCOMP_RET_ERRNO, 0, 0);
T_ (ii == idx_allow);
iw(&p, ii++, BPF_RET|BPF_K, SECCOMP_RET_ALLOW, 0, 0);
T_ (ii == idx_fake);
iw(&p, ii++, BPF_RET|BPF_K, SECCOMP_RET_ERRNO, 0, 0);

// mknod(2) handling. This just loads the file mode and jumps to the right
// place in the mknodat(2) handling.
T_ (ii == idx_mknod);
// load mode argument into accumulator
iw(&p, ii++, BPF_LD|BPF_W|BPF_ABS,
offsetof(struct seccomp_data, args[1]), 0, 0);
// jump to mode test
iw(&p, ii++, BPF_JMP|BPF_JA, 1, 0, 0);

// mknodat(2) handling.
T_ (ii == idx_mknodat);
// load mode argument into accumulator
iw(&p, ii++, BPF_LD|BPF_W|BPF_ABS,
offsetof(struct seccomp_data, args[2]), 0, 0);
// jump to fake return if trying to create a device.
iw(&p, ii++, BPF_ALU|BPF_AND|BPF_K, S_IFMT, 0, 0); // file type only
iw(&p, ii++, BPF_JMP|BPF_JEQ|BPF_K, S_IFCHR, 2, 0);
iw(&p, ii++, BPF_JMP|BPF_JEQ|BPF_K, S_IFBLK, 1, 0);
// returns
iw(&p, ii++, BPF_RET|BPF_K, SECCOMP_RET_ALLOW, 0, 0);
iw(&p, ii++, BPF_RET|BPF_K, SECCOMP_RET_ERRNO, 0, 0);

// Install filter. Use prctl(2) rather than seccomp(2) for slightly greater
// compatibility (Linux 3.5 rather than 3.17) and because there is a glibc
// wrapper.
T_ (ii == p.len); // next instruction now one past the end of the buffer
Z_ (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &p));
DEBUG("note: see FAQ to disassemble the above")

// Test filter. This will fail if the kernel executes the call (because we
// are not really privileged and the arguments are bogus) or succeed if
Expand Down
11 changes: 6 additions & 5 deletions doc/ch-image.rst
Original file line number Diff line number Diff line change
Expand Up @@ -788,11 +788,12 @@ This mode uses the kernel’s :code:`seccomp(2)` system call filtering to
intercept certain privileged system calls, do absolutely nothing, and return
success to the program.

The quashed system calls are: :code:`capset(2)`; :code:`chown(2)` and friends;
:code:`kexec_load(2)` (used to validate the filter itself); :code:`mknod(2)`
and :code:`mknodat(2)`; and :code:`setuid(2)`, :code:`setgid(2)`, and
:code:`setgroups(2)` along with the other system calls that change user or
group.
Some system calls are quashed regardless of their arguments:
:code:`capset(2)`; :code:`chown(2)` and friends; :code:`kexec_load(2)` (used
to validate the filter itself); ; and :code:`setuid(2)`, :code:`setgid(2)`,
and :code:`setgroups(2)` along with the other system calls that change user or
group. :code:`mknod(2)` and :code:`mknodat(2)` are quashed if they try to
create a device file (e.g., creating FIFOs works normally).

The advantages of this approach is that it’s much simpler, it’s faster, it’s
completely agnostic to libc, and it’s mostly agnostic to distribution. The
Expand Down
42 changes: 42 additions & 0 deletions doc/dev.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1582,6 +1582,48 @@ catch both dot notation and tuples, but not the list of filenames in

What to do in each location should either be obvious or commented.

Debugging :code:`seccomp(2)` BPF
--------------------------------

:code:`ch-run --seccomp -vv` will log the BPF instructions as they are
computed, but it’s all in raw hex and hard to interpret, e.g.::

$ ch-run --seccomp -vv alpine:3.19 -- true
[...]
ch-run[62763]: seccomp: arch c00000b7: found 13 syscalls (ch_core.c:582)
ch-run[62763]: seccomp: arch 40000028: found 27 syscalls (ch_core.c:582)
[...]
ch-run[62763]: seccomp(2) program has 156 instructions (ch_core.c:591)
ch-run[62763]: 0: { op=20 k= 4 jt= 0 jf= 0 } (ch_core.c:423)
ch-run[62763]: 1: { op=15 k=c00000b7 jt= 0 jf= 17 } (ch_core.c:423)
ch-run[62763]: 2: { op=20 k= 0 jt= 0 jf= 0 } (ch_core.c:423)
ch-run[62763]: 3: { op=15 k= 5b jt=145 jf= 0 } (ch_core.c:423)
[...]
ch-run[62763]: 154: { op= 6 k=7fff0000 jt= 0 jf= 0 } (ch_core.c:423)
ch-run[62763]: 155: { op= 6 k= 50000 jt= 0 jf= 0 } (ch_core.c:423)
ch-run[62763]: note: see FAQ to disassemble the above (ch_core.c:676)
ch-run[62763]: executing: true (ch_core.c:538)

You can instead use `seccomp-tools
<https://github.com/david942j/seccomp-tools>`_ to disassemble and pretty-print
the BPF code in a far easier format, e.g.::

$ sudo apt install ruby-dev
$ gem install --user-install seccomp-tools
$ export PATH=~/.gem/ruby/3.1.0/bin:$PATH
$ seccomp-tools dump -c 'ch-run --seccomp alpine:3.19 -- true'
line CODE JT JF K
=================================
0000: 0x20 0x00 0x00 0x00000004 A = arch
0001: 0x15 0x00 0x11 0xc00000b7 if (A != ARCH_AARCH64) goto 0019
0002: 0x20 0x00 0x00 0x00000000 A = sys_number
0003: 0x15 0x91 0x00 0x0000005b if (A == aarch64.capset) goto 0149
[...]
0154: 0x06 0x00 0x00 0x7fff0000 return ALLOW
0155: 0x06 0x00 0x00 0x00050000 return ERRNO(0)

Note that the disassembly is not perfect; e.g. if an architecture is not in
your kernel headers, the system call name is wrong.

.. LocalWords: milestoned gh nv cht Chacon’s scottchacon mis cantfix tmpimg
.. LocalWords: rootfs cbd cae ce bafb bc weirdal yankovic nop cb fbe adb fd
Expand Down
3 changes: 3 additions & 0 deletions examples/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,9 @@ paraview/cone.nranks.vtk \
paraview/cone.png \
paraview/cone.py \
paraview/cone.serial.vtk \
seccomp/Dockerfile \
seccomp/mknods.c \
seccomp/test.bats \
spack/Dockerfile \
spark/Dockerfile \
spark/slurm.sh
Expand Down
2 changes: 1 addition & 1 deletion examples/paraview/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# ch-test-scope: full
# ch-test-scope: skip #1810
FROM openmpi
WORKDIR /usr/local/src

Expand Down
10 changes: 10 additions & 0 deletions examples/seccomp/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# ch-test-builder-include: ch-image
FROM alpine:3.17
RUN apk add gcc musl-dev strace
RSYNC / /
RUN gcc -std=c11 -Wall -Werror -fmax-errors=1 -o mknods mknods.c
RUN strace ./mknods
RUN ls -lh /_*
RUN test $(ls /_* | wc -l) == 2
RUN test -p /_mknod_fifo
RUN test -p /_mknodat_fifo
28 changes: 28 additions & 0 deletions examples/seccomp/mknods.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
/* Use mknod(2) and mknodat(2) to create character and block devices (which
should be blocked by the seccomp filters) and FIFOs (which should not.) */

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/sysmacros.h>

#define DEVNULL makedev(1,3) // character device /dev/null
#define DEVRAM0 makedev(1,0) // block device /dev/ram0
#define Z_(x) if (x) (fprintf(stderr, "failed: %d: %s (%d)\n", \
__LINE__, strerror(errno), errno), \
exit(1))

int main(void)
{
Z_ (mknod("/_mknod_chr", S_IFCHR, DEVNULL));
Z_ (mknod("/_mknod_blk", S_IFBLK, DEVRAM0));
Z_ (mknod("/_mknod_fifo", S_IFIFO, 0));

Z_ (mknodat(AT_FDCWD, "./_mknodat_chr", S_IFCHR, DEVNULL));
Z_ (mknodat(AT_FDCWD, "./_mknodat_blk", S_IFBLK, DEVRAM0));
Z_ (mknodat(AT_FDCWD, "./_mknodat_fifo", S_IFIFO, 0));
}
14 changes: 14 additions & 0 deletions examples/seccomp/test.bats
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
CH_TEST_TAG=$ch_test_tag
load "$CHTEST_DIR"/common.bash

setup () {
prerequisites_ok seccomp
}

@test "${ch_tag}/fifos only" {
ch-run "$ch_img" -- sh -c 'ls -lh /_*'
# shellcheck disable=SC2016
ch-run "$ch_img" -- sh -c 'test $(ls /_* | wc -l) == 2'
ch-run "$ch_img" -- test -p /_mknod_fifo
ch-run "$ch_img" -- test -p /_mknodat_fifo
}
1 change: 0 additions & 1 deletion examples/spack/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
# ch-test-scope: skip # issue #1779
FROM almalinux:8

# Note: Spack is a bit of an odd duck testing wise. Because it’s a package
Expand Down

0 comments on commit 07f5a45

Please sign in to comment.