| From d262fb5809de27d51e9d6b97c6b114804c2b95c5 Mon Sep 17 00:00:00 2001 |
| From: Lennart Poettering <lennart@poettering.net> |
| Date: Tue, 1 Nov 2016 20:25:19 -0600 |
| Subject: [PATCH] core: add new RestrictNamespaces= unit file setting |
| |
| This new setting permits restricting whether namespaces may be created and |
| managed by processes started by a unit. It installs a seccomp filter blocking |
| certain invocations of unshare(), clone() and setns(). |
| |
| RestrictNamespaces=no is the default, and does not restrict namespaces in any |
| way. RestrictNamespaces=yes takes away the ability to create or manage any kind |
| of namespace. "RestrictNamespaces=mnt ipc" restricts the creation of namespaces |
| so that only mount and IPC namespaces may be created/managed, but no other |
| kind of namespaces. |
| |
| This setting should improve security quite a bit as in particular user |
| namespacing was a major source of CVEs in the kernel in the past, and is |
| accessible to unprivileged processes. With this setting the entire attack |
| surface may be removed for system services that do not make use of namespaces. |
| |
| (cherry picked from commit add005357d535681c7075ced8eec2b6e61b43728) |
| --- |
| Makefile.am | 4 +- |
| TODO | 6 -- |
| man/systemd.exec.xml | 50 +++++++++----- |
| src/core/dbus-execute.c | 21 ++++++ |
| src/core/execute.c | 30 ++++++++ |
| src/core/execute.h | 9 +++ |
| src/core/load-fragment-gperf.gperf.m4 | 2 + |
| src/core/load-fragment.c | 49 +++++++++++++ |
| src/core/load-fragment.h | 1 + |
| src/shared/bus-unit-util.c | 25 +++++++ |
| src/shared/nsflags.c | 126 ++++++++++++++++++++++++++++++++++ |
| src/shared/nsflags.h | 49 +++++++++++++ |
| src/shared/seccomp-util.c | 89 ++++++++++++++++++++++++ |
| src/shared/seccomp-util.h | 2 + |
| src/test/test-seccomp.c | 94 +++++++++++++++++++++++++ |
| 15 files changed, 534 insertions(+), 23 deletions(-) |
| create mode 100644 src/shared/nsflags.c |
| create mode 100644 src/shared/nsflags.h |
| |
| diff --git a/Makefile.am b/Makefile.am |
| index f2d8bf57f7..1031e797b3 100644 |
| --- a/Makefile.am |
| +++ b/Makefile.am |
| @@ -1046,7 +1046,9 @@ libshared_la_SOURCES = \ |
| src/shared/tests.h \ |
| src/shared/tests.c \ |
| src/shared/fdset.c \ |
| - src/shared/fdset.h |
| + src/shared/fdset.h \ |
| + src/shared/nsflags.h \ |
| + src/shared/nsflags.c |
| |
| if HAVE_UTMP |
| libshared_la_SOURCES += \ |
| diff --git a/TODO b/TODO |
| index c8266a549d..164e33708e 100644 |
| --- a/TODO |
| +++ b/TODO |
| @@ -59,14 +59,10 @@ Features: |
| |
| * define gpt header bits to select volatility mode |
| |
| -* nspawn: mount loopback filesystems with "discard" |
| - |
| * ProtectKernelLogs= (drops CAP_SYSLOG, add seccomp for syslog() syscall, and DeviceAllow to /dev/kmsg) in service files |
| |
| * ProtectClock= (drops CAP_SYS_TIMES, adds seecomp filters for settimeofday, adjtimex), sets DeviceAllow o /dev/rtc |
| |
| -* ProtectKernelModules= (drops CAP_SYS_MODULE and filters the kmod syscalls) |
| - |
| * ProtectTracing= (drops CAP_SYS_PTRACE, blocks ptrace syscall, makes /sys/kernel/tracing go away) |
| |
| * ProtectMount= (drop mount/umount/pivot_root from seccomp, disallow fuse via DeviceAllow, imply Mountflags=slave) |
| @@ -88,8 +84,6 @@ Features: |
| |
| * Add RootImage= for mounting a disk image or file as root directory |
| |
| -* RestrictNamespaces= or so in services (taking away the ability to create namespaces, with setns, unshare, clone) |
| - |
| * make sure the ratelimit object can deal with USEC_INFINITY as way to turn off things |
| |
| * journalctl: make sure -f ends when the container indicated by -M terminates |
| diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml |
| index 3c350df11f..33bca1bfb0 100644 |
| --- a/man/systemd.exec.xml |
| +++ b/man/systemd.exec.xml |
| @@ -1234,22 +1234,16 @@ |
| <varlistentry> |
| <term><varname>NoNewPrivileges=</varname></term> |
| |
| - <listitem><para>Takes a boolean argument. If true, ensures that the service |
| - process and all its children can never gain new privileges. This option is more |
| - powerful than the respective secure bits flags (see above), as it also prohibits |
| - UID changes of any kind. This is the simplest and most effective way to ensure that |
| - a process and its children can never elevate privileges again. Defaults to false, |
| - but in the user manager instance certain settings force |
| - <varname>NoNewPrivileges=yes</varname>, ignoring the value of this setting. |
| - Those is the case when <varname>SystemCallFilter=</varname>, |
| - <varname>SystemCallArchitectures=</varname>, |
| - <varname>RestrictAddressFamilies=</varname>, |
| - <varname>PrivateDevices=</varname>, |
| - <varname>ProtectKernelTunables=</varname>, |
| - <varname>ProtectKernelModules=</varname>, |
| - <varname>MemoryDenyWriteExecute=</varname>, or |
| - <varname>RestrictRealtime=</varname> are specified. |
| - </para></listitem> |
| + <listitem><para>Takes a boolean argument. If true, ensures that the service process and all its children can |
| + never gain new privileges through <function>execve()</function> (e.g. via setuid or setgid bits, or filesystem |
| + capabilities). This is the simplest and most effective way to ensure that a process and its children can never |
| + elevate privileges again. Defaults to false, but in the user manager instance certain settings force |
| + <varname>NoNewPrivileges=yes</varname>, ignoring the value of this setting. This is the case when |
| + <varname>SystemCallFilter=</varname>, <varname>SystemCallArchitectures=</varname>, |
| + <varname>RestrictAddressFamilies=</varname>, <varname>RestrictNamespaces=</varname>, |
| + <varname>PrivateDevices=</varname>, <varname>ProtectKernelTunables=</varname>, |
| + <varname>ProtectKernelModules=</varname>, <varname>MemoryDenyWriteExecute=</varname>, or |
| + <varname>RestrictRealtime=</varname> are specified.</para></listitem> |
| </varlistentry> |
| |
| <varlistentry> |
| @@ -1462,6 +1456,30 @@ |
| logging. This does not affect commands prefixed with <literal>+</literal>.</para></listitem> |
| </varlistentry> |
| |
| + <varlistentry> |
| + <term><varname>RestrictNamespaces=</varname></term> |
| + |
| + <listitem><para>Restricts access to Linux namespace functionality for the processes of this unit. For details |
| + about Linux namespaces, see |
| + <citerefentry><refentrytitle>namespaces</refentrytitle><manvolnum>7</manvolnum></citerefentry>. Either takes a |
| + boolean argument, or a space-separated list of namespace type identifiers. If false (the default), no |
| + restrictions on namespace creation and switching are made. If true, access to any kind of namespacing is |
| + prohibited. Otherwise, a space-separated list of namespace type identifiers must be specified, consisting of |
| + any combination of: <constant>cgroup</constant>, <constant>ipc</constant>, <constant>net</constant>, |
| + <constant>mnt</constant>, <constant>pid</constant>, <constant>user</constant> and <constant>uts</constant>. Any |
| + namespace type listed is made accessible to the unit's processes, access to namespace types not listed is |
| + prohibited (whitelisting). By prepending the list with a single tilde character (<literal>~</literal>) the |
| + effect may be inverted: only the listed namespace types will be made inaccessible, all unlisted ones are |
| + permitted (blacklisting). If the empty string is assigned, the default namespace restrictions are applied, |
| + which is equivalent to false. Internally, this setting limits access to the |
| + <citerefentry><refentrytitle>unshare</refentrytitle><manvolnum>2</manvolnum></citerefentry>, |
| + <citerefentry><refentrytitle>clone</refentrytitle><manvolnum>2</manvolnum></citerefentry> and |
| + <citerefentry><refentrytitle>setns</refentrytitle><manvolnum>2</manvolnum></citerefentry> system calls, taking |
| + the specified flags parameters into account. Note that — if this option is used — in addition to restricting |
| + creation and switching of the specified types of namespaces (or all of them, if true) access to the |
| + <function>setns()</function> system call with a zero flags parameter is prohibited.</para></listitem> |
| + </varlistentry> |
| + |
| <varlistentry> |
| <term><varname>ProtectKernelModules=</varname></term> |
| |
| diff --git a/src/core/dbus-execute.c b/src/core/dbus-execute.c |
| index 03f23780c1..d7bb0496a0 100644 |
| --- a/src/core/dbus-execute.c |
| +++ b/src/core/dbus-execute.c |
| @@ -781,6 +781,7 @@ const sd_bus_vtable bus_exec_vtable[] = { |
| SD_BUS_PROPERTY("RuntimeDirectory", "as", NULL, offsetof(ExecContext, runtime_directory), SD_BUS_VTABLE_PROPERTY_CONST), |
| SD_BUS_PROPERTY("MemoryDenyWriteExecute", "b", bus_property_get_bool, offsetof(ExecContext, memory_deny_write_execute), SD_BUS_VTABLE_PROPERTY_CONST), |
| SD_BUS_PROPERTY("RestrictRealtime", "b", bus_property_get_bool, offsetof(ExecContext, restrict_realtime), SD_BUS_VTABLE_PROPERTY_CONST), |
| + SD_BUS_PROPERTY("RestrictNamespaces", "t", bus_property_get_ulong, offsetof(ExecContext, restrict_namespaces), SD_BUS_VTABLE_PROPERTY_CONST), |
| SD_BUS_VTABLE_END |
| }; |
| |
| @@ -1591,7 +1592,27 @@ int bus_exec_context_set_transient_property( |
| } |
| |
| return 1; |
| + } else if (streq(name, "RestrictNamespaces")) { |
| + uint64_t flags; |
| |
| + r = sd_bus_message_read(message, "t", &flags); |
| + if (r < 0) |
| + return r; |
| + if ((flags & NAMESPACE_FLAGS_ALL) != flags) |
| + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Unknown namespace types"); |
| + |
| + if (mode != UNIT_CHECK) { |
| + _cleanup_free_ char *s = NULL; |
| + |
| + r = namespace_flag_to_string_many(flags, &s); |
| + if (r < 0) |
| + return r; |
| + |
| + c->restrict_namespaces = flags; |
| + unit_write_drop_in_private_format(u, mode, name, "%s=%s", name, s[0] ? s : "yes"); /* flags == 0 stringifies to "", which the parser treats as "reset to default"; write "yes" to keep everything restricted */ |
| + } |
| + |
| + return 1; |
| } |
| |
| ri = rlimit_from_string(name); |
| diff --git a/src/core/execute.c b/src/core/execute.c |
| index 224382b581..59ce0774c4 100644 |
| --- a/src/core/execute.c |
| +++ b/src/core/execute.c |
| @@ -1534,6 +1534,18 @@ static int apply_private_devices(const Unit *u, const ExecContext *c) { |
| return seccomp_load_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM)); |
| } |
| |
| +static int apply_restrict_namespaces(Unit *u, const ExecContext *c) { |
| + assert(c); |
| + |
| + if (!exec_context_restrict_namespaces_set(c)) |
| + return 0; |
| + |
| + if (skip_seccomp_unavailable(u, "RestrictNamespaces=")) |
| + return 0; |
| + |
| + return seccomp_restrict_namespaces(c->restrict_namespaces); |
| +} |
| + |
| #endif |
| |
| static void do_idle_pipe_dance(int idle_pipe[4]) { |
| @@ -2183,6 +2195,7 @@ static bool context_has_no_new_privileges(const ExecContext *c) { |
| return context_has_address_families(c) || /* we need NNP if we have any form of seccomp and are unprivileged */ |
| c->memory_deny_write_execute || |
| c->restrict_realtime || |
| + exec_context_restrict_namespaces_set(c) || |
| c->protect_kernel_tunables || |
| c->protect_kernel_modules || |
| c->private_devices || |
| @@ -2764,6 +2777,12 @@ static int exec_child( |
| } |
| } |
| |
| + r = apply_restrict_namespaces(unit, context); |
| + if (r < 0) { |
| + *exit_status = EXIT_SECCOMP; |
| + return r; |
| + } |
| + |
| if (context->protect_kernel_tunables) { |
| r = apply_protect_sysctl(unit, context); |
| if (r < 0) { |
| @@ -2947,6 +2966,7 @@ void exec_context_init(ExecContext *c) { |
| c->personality = PERSONALITY_INVALID; |
| c->runtime_directory_mode = 0755; |
| c->capability_bounding_set = CAP_ALL; |
| + c->restrict_namespaces = NAMESPACE_FLAGS_ALL; |
| } |
| |
| void exec_context_done(ExecContext *c) { |
| @@ -3244,6 +3264,7 @@ static void strv_fprintf(FILE *f, char **l) { |
| void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) { |
| char **e, **d; |
| unsigned i; |
| + int r; |
| |
| assert(c); |
| assert(f); |
| @@ -3524,6 +3545,15 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) { |
| fputc('\n', f); |
| } |
| |
| + if (exec_context_restrict_namespaces_set(c)) { |
| + _cleanup_free_ char *s = NULL; |
| + |
| + r = namespace_flag_to_string_many(c->restrict_namespaces, &s); |
| + if (r >= 0) |
| + fprintf(f, "%sRestrictNamespaces: %s\n", |
| + prefix, s); |
| + } |
| + |
| if (c->syscall_errno > 0) |
| fprintf(f, |
| "%sSystemCallErrorNumber: %s\n", |
| diff --git a/src/core/execute.h b/src/core/execute.h |
| index c7d0f7761e..56f880cffe 100644 |
| --- a/src/core/execute.h |
| +++ b/src/core/execute.h |
| @@ -35,6 +35,7 @@ typedef struct ExecParameters ExecParameters; |
| #include "list.h" |
| #include "missing.h" |
| #include "namespace.h" |
| +#include "nsflags.h" |
| |
| typedef enum ExecUtmpMode { |
| EXEC_UTMP_INIT, |
| @@ -195,6 +196,8 @@ struct ExecContext { |
| |
| unsigned long personality; |
| |
| + unsigned long restrict_namespaces; /* The CLONE_NEWxyz flags permitted to the unit's processes */ |
| + |
| Set *syscall_filter; |
| Set *syscall_archs; |
| int syscall_errno; |
| @@ -216,6 +219,12 @@ struct ExecContext { |
| bool no_new_privileges_set:1; |
| }; |
| |
| +static inline bool exec_context_restrict_namespaces_set(const ExecContext *c) { |
| + assert(c); |
| + |
| + return (c->restrict_namespaces & NAMESPACE_FLAGS_ALL) != NAMESPACE_FLAGS_ALL; |
| +} |
| + |
| typedef enum ExecFlags { |
| EXEC_CONFIRM_SPAWN = 1U << 0, |
| EXEC_APPLY_PERMISSIONS = 1U << 1, |
| diff --git a/src/core/load-fragment-gperf.gperf.m4 b/src/core/load-fragment-gperf.gperf.m4 |
| index af2f9d960b..cb2f384f47 100644 |
| --- a/src/core/load-fragment-gperf.gperf.m4 |
| +++ b/src/core/load-fragment-gperf.gperf.m4 |
| @@ -57,12 +57,14 @@ m4_ifdef(`HAVE_SECCOMP', |
| $1.SystemCallArchitectures, config_parse_syscall_archs, 0, offsetof($1, exec_context.syscall_archs) |
| $1.SystemCallErrorNumber, config_parse_syscall_errno, 0, offsetof($1, exec_context) |
| $1.MemoryDenyWriteExecute, config_parse_bool, 0, offsetof($1, exec_context.memory_deny_write_execute) |
| +$1.RestrictNamespaces, config_parse_restrict_namespaces, 0, offsetof($1, exec_context) |
| $1.RestrictRealtime, config_parse_bool, 0, offsetof($1, exec_context.restrict_realtime) |
| $1.RestrictAddressFamilies, config_parse_address_families, 0, offsetof($1, exec_context)', |
| `$1.SystemCallFilter, config_parse_warn_compat, DISABLED_CONFIGURATION, 0 |
| $1.SystemCallArchitectures, config_parse_warn_compat, DISABLED_CONFIGURATION, 0 |
| $1.SystemCallErrorNumber, config_parse_warn_compat, DISABLED_CONFIGURATION, 0 |
| $1.MemoryDenyWriteExecute, config_parse_warn_compat, DISABLED_CONFIGURATION, 0 |
| +$1.RestrictNamespaces, config_parse_warn_compat, DISABLED_CONFIGURATION, 0 |
| $1.RestrictRealtime, config_parse_warn_compat, DISABLED_CONFIGURATION, 0 |
| $1.RestrictAddressFamilies, config_parse_warn_compat, DISABLED_CONFIGURATION, 0') |
| $1.LimitCPU, config_parse_limit, RLIMIT_CPU, offsetof($1, exec_context.rlimit) |
| diff --git a/src/core/load-fragment.c b/src/core/load-fragment.c |
| index cbc826809e..e0fa484c1e 100644 |
| --- a/src/core/load-fragment.c |
| +++ b/src/core/load-fragment.c |
| @@ -2905,6 +2905,54 @@ int config_parse_address_families( |
| if (!isempty(state)) |
| log_syntax(unit, LOG_ERR, filename, line, 0, "Trailing garbage, ignoring."); |
| |
| + return 0; |
| +} |
| + |
| +int config_parse_restrict_namespaces( |
| + const char *unit, |
| + const char *filename, |
| + unsigned line, |
| + const char *section, |
| + unsigned section_line, |
| + const char *lvalue, |
| + int ltype, |
| + const char *rvalue, |
| + void *data, |
| + void *userdata) { |
| + |
| + ExecContext *c = data; |
| + bool invert = false; |
| + int r; |
| + |
| + if (isempty(rvalue)) { |
| + /* Reset to the default. */ |
| + c->restrict_namespaces = NAMESPACE_FLAGS_ALL; |
| + return 0; |
| + } |
| + |
| + if (rvalue[0] == '~') { |
| + invert = true; |
| + rvalue++; |
| + } |
| + |
| + r = parse_boolean(rvalue); |
| + if (r > 0) |
| + c->restrict_namespaces = 0; |
| + else if (r == 0) |
| + c->restrict_namespaces = NAMESPACE_FLAGS_ALL; |
| + else { |
| + /* Not a boolean argument, in this case it's a list of namespace types. */ |
| + |
| + r = namespace_flag_from_string_many(rvalue, &c->restrict_namespaces); |
| + if (r < 0) { |
| + log_syntax(unit, LOG_ERR, filename, line, r, "Failed to parse namespace type string, ignoring: %s", rvalue); |
| + return 0; |
| + } |
| + } |
| + |
| + if (invert) |
| + c->restrict_namespaces = (~c->restrict_namespaces) & NAMESPACE_FLAGS_ALL; |
| + |
| return 0; |
| } |
| #endif |
| @@ -4327,6 +4375,7 @@ void unit_dump_config_items(FILE *f) { |
| { config_parse_syscall_archs, "ARCHS" }, |
| { config_parse_syscall_errno, "ERRNO" }, |
| { config_parse_address_families, "FAMILIES" }, |
| + { config_parse_restrict_namespaces, "NAMESPACES" }, |
| #endif |
| { config_parse_cpu_shares, "SHARES" }, |
| { config_parse_cpu_weight, "WEIGHT" }, |
| diff --git a/src/core/load-fragment.h b/src/core/load-fragment.h |
| index c05f205c37..1cff815a50 100644 |
| --- a/src/core/load-fragment.h |
| +++ b/src/core/load-fragment.h |
| @@ -116,6 +116,7 @@ int config_parse_fdname(const char *unit, const char *filename, unsigned line, c |
| int config_parse_sec_fix_0(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata); |
| int config_parse_user_group(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata); |
| int config_parse_user_group_strv(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata); |
| +int config_parse_restrict_namespaces(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata); |
| |
| /* gperf prototypes */ |
| const struct ConfigPerfItem* load_fragment_gperf_lookup(const char *key, unsigned length); |
| diff --git a/src/shared/bus-unit-util.c b/src/shared/bus-unit-util.c |
| index f639e0e832..35e2c8f18e 100644 |
| --- a/src/shared/bus-unit-util.c |
| +++ b/src/shared/bus-unit-util.c |
| @@ -27,6 +27,7 @@ |
| #include "hashmap.h" |
| #include "list.h" |
| #include "locale-util.h" |
| +#include "nsflags.h" |
| #include "parse-util.h" |
| #include "path-util.h" |
| #include "process-util.h" |
| @@ -553,6 +554,30 @@ int bus_append_unit_property_assignment(sd_bus_message *m, const char *assignmen |
| |
| r = sd_bus_message_close_container(m); |
| |
| + } else if (streq(field, "RestrictNamespaces")) { |
| + bool invert = false; |
| + unsigned long flags = 0; /* must match the "unsigned long *" taken by namespace_flag_from_string_many() */ |
| + |
| + if (eq[0] == '~') { |
| + invert = true; |
| + eq++; |
| + } |
| + |
| + r = parse_boolean(eq); |
| + if (r > 0) |
| + flags = 0; |
| + else if (r == 0) |
| + flags = NAMESPACE_FLAGS_ALL; |
| + else { |
| + r = namespace_flag_from_string_many(eq, &flags); |
| + if (r < 0) |
| + return log_error_errno(r, "Failed to parse %s value %s.", field, eq); |
| + } |
| + |
| + if (invert) |
| + flags = (~flags) & NAMESPACE_FLAGS_ALL; |
| + |
| + r = sd_bus_message_append(m, "v", "t", (uint64_t) flags); /* "t" is a 64-bit integer on the wire; widen explicitly for the varargs call */ |
| } else { |
| log_error("Unknown assignment %s.", assignment); |
| return -EINVAL; |
| diff --git a/src/shared/nsflags.c b/src/shared/nsflags.c |
| new file mode 100644 |
| index 0000000000..8fcbe97ba7 |
| --- /dev/null |
| +++ b/src/shared/nsflags.c |
| @@ -0,0 +1,126 @@ |
| +/*** |
| + This file is part of systemd. |
| + |
| + Copyright 2016 Lennart Poettering |
| + |
| + systemd is free software; you can redistribute it and/or modify it |
| + under the terms of the GNU Lesser General Public License as published by |
| + the Free Software Foundation; either version 2.1 of the License, or |
| + (at your option) any later version. |
| + |
| + systemd is distributed in the hope that it will be useful, but |
| + WITHOUT ANY WARRANTY; without even the implied warranty of |
| + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| + Lesser General Public License for more details. |
| + |
| + You should have received a copy of the GNU Lesser General Public License |
| + along with systemd; If not, see <http://www.gnu.org/licenses/>. |
| +***/ |
| + |
| +#include <sched.h> |
| + |
| +#include "alloc-util.h" |
| +#include "extract-word.h" |
| +#include "nsflags.h" |
| +#include "seccomp-util.h" |
| +#include "string-util.h" |
| + |
| +const struct namespace_flag_map namespace_flag_map[] = { |
| + { CLONE_NEWCGROUP, "cgroup" }, |
| + { CLONE_NEWIPC, "ipc" }, |
| + { CLONE_NEWNET, "net" }, |
| + /* So, the mount namespace flag is called CLONE_NEWNS for historical reasons. Let's expose it here under a more |
| + * explanatory name: "mnt". This is in-line with how the kernel exposes namespaces in /proc/$PID/ns. */ |
| + { CLONE_NEWNS, "mnt" }, |
| + { CLONE_NEWPID, "pid" }, |
| + { CLONE_NEWUSER, "user" }, |
| + { CLONE_NEWUTS, "uts" }, |
| + {} |
| +}; |
| + |
| +const char* namespace_flag_to_string(unsigned long flag) { |
| + unsigned i; |
| + |
| + flag &= NAMESPACE_FLAGS_ALL; |
| + |
| + for (i = 0; namespace_flag_map[i].name; i++) |
| + if (flag == namespace_flag_map[i].flag) |
| + return namespace_flag_map[i].name; |
| + |
| + return NULL; /* either unknown namespace flag, or a combination of many. This call supports neither. */ |
| +} |
| + |
| +unsigned long namespace_flag_from_string(const char *name) { |
| + unsigned i; |
| + |
| + if (isempty(name)) |
| + return 0; |
| + |
| + for (i = 0; namespace_flag_map[i].name; i++) |
| + if (streq(name, namespace_flag_map[i].name)) |
| + return namespace_flag_map[i].flag; |
| + |
| + return 0; |
| +} |
| + |
| +int namespace_flag_from_string_many(const char *name, unsigned long *ret) { |
| + unsigned long flags = 0; |
| + int r; |
| + |
| + assert_se(ret); |
| + |
| + if (!name) { |
| + *ret = 0; |
| + return 0; |
| + } |
| + |
| + for (;;) { |
| + _cleanup_free_ char *word = NULL; |
| + unsigned long f; |
| + |
| + r = extract_first_word(&name, &word, NULL, 0); |
| + if (r < 0) |
| + return r; |
| + if (r == 0) |
| + break; |
| + |
| + f = namespace_flag_from_string(word); |
| + if (f == 0) |
| + return -EINVAL; |
| + |
| + flags |= f; |
| + } |
| + |
| + *ret = flags; |
| + return 0; |
| +} |
| + |
| +int namespace_flag_to_string_many(unsigned long flags, char **ret) { |
| + _cleanup_free_ char *s = NULL; |
| + unsigned i; |
| + |
| + for (i = 0; namespace_flag_map[i].name; i++) { |
| + if ((flags & namespace_flag_map[i].flag) != namespace_flag_map[i].flag) |
| + continue; |
| + |
| + if (!s) { |
| + s = strdup(namespace_flag_map[i].name); |
| + if (!s) |
| + return -ENOMEM; |
| + } else { |
| + if (!strextend(&s, " ", namespace_flag_map[i].name, NULL)) |
| + return -ENOMEM; |
| + } |
| + } |
| + |
| + if (!s) { |
| + s = strdup(""); |
| + if (!s) |
| + return -ENOMEM; |
| + } |
| + |
| + *ret = s; |
| + s = NULL; |
| + |
| + return 0; |
| +} |
| diff --git a/src/shared/nsflags.h b/src/shared/nsflags.h |
| new file mode 100644 |
| index 0000000000..152ab8b936 |
| --- /dev/null |
| +++ b/src/shared/nsflags.h |
| @@ -0,0 +1,49 @@ |
| +#pragma once |
| + |
| +/*** |
| + This file is part of systemd. |
| + |
| + Copyright 2016 Lennart Poettering |
| + |
| + systemd is free software; you can redistribute it and/or modify it |
| + under the terms of the GNU Lesser General Public License as published by |
| + the Free Software Foundation; either version 2.1 of the License, or |
| + (at your option) any later version. |
| + |
| + systemd is distributed in the hope that it will be useful, but |
| + WITHOUT ANY WARRANTY; without even the implied warranty of |
| + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| + Lesser General Public License for more details. |
| + |
| + You should have received a copy of the GNU Lesser General Public License |
| + along with systemd; If not, see <http://www.gnu.org/licenses/>. |
| +***/ |
| + |
| +#include <sched.h> |
| + |
| +#include "missing.h" |
| + |
| +/* The combination of all namespace flags defined by the kernel. The right type for this isn't clear. setns() and |
| + * unshare() expect these flags to be passed as (signed) "int", while clone() wants them as "unsigned long". The latter |
| + * is definitely more appropriate for a flags parameter, and also the larger type of the two, hence let's stick to that |
| + * here. */ |
| +#define NAMESPACE_FLAGS_ALL \ |
| + ((unsigned long) (CLONE_NEWCGROUP| \ |
| + CLONE_NEWIPC| \ |
| + CLONE_NEWNET| \ |
| + CLONE_NEWNS| \ |
| + CLONE_NEWPID| \ |
| + CLONE_NEWUSER| \ |
| + CLONE_NEWUTS)) |
| + |
| +const char* namespace_flag_to_string(unsigned long flag); |
| +unsigned long namespace_flag_from_string(const char *name); |
| +int namespace_flag_from_string_many(const char *name, unsigned long *ret); |
| +int namespace_flag_to_string_many(unsigned long flags, char **ret); |
| + |
| +struct namespace_flag_map { |
| + unsigned long flag; |
| + const char *name; |
| +}; |
| + |
| +extern const struct namespace_flag_map namespace_flag_map[]; |
| diff --git a/src/shared/seccomp-util.c b/src/shared/seccomp-util.c |
| index c9b24f1065..55b97e1efb 100644 |
| --- a/src/shared/seccomp-util.c |
| +++ b/src/shared/seccomp-util.c |
| @@ -23,7 +23,9 @@ |
| #include <sys/prctl.h> |
| #include <linux/seccomp.h> |
| |
| +#include "alloc-util.h" |
| #include "macro.h" |
| +#include "nsflags.h" |
| #include "seccomp-util.h" |
| #include "string-util.h" |
| #include "util.h" |
| @@ -574,5 +576,92 @@ int seccomp_load_filter_set(uint32_t default_action, const SyscallFilterSet *set |
| finish: |
| seccomp_release(seccomp); |
| return r; |
| +} |
| + |
| +int seccomp_restrict_namespaces(unsigned long retain) { |
| + scmp_filter_ctx seccomp; |
| + unsigned i; |
| + int r; |
| + |
| + if (log_get_max_level() >= LOG_DEBUG) { |
| + _cleanup_free_ char *s = NULL; |
| + |
| + (void) namespace_flag_to_string_many(retain, &s); |
| + log_debug("Restricting namespace to: %s.", strna(s)); |
| + } |
| + |
| + /* NOOP? */ |
| + if ((retain & NAMESPACE_FLAGS_ALL) == NAMESPACE_FLAGS_ALL) |
| + return 0; |
| + |
| + r = seccomp_init_conservative(&seccomp, SCMP_ACT_ALLOW); |
| + if (r < 0) |
| + return r; |
| + |
| + if ((retain & NAMESPACE_FLAGS_ALL) == 0) |
| + /* If every single kind of namespace shall be prohibited, then let's block the whole setns() syscall |
| + * altogether. */ |
| + r = seccomp_rule_add( |
| + seccomp, |
| + SCMP_ACT_ERRNO(EPERM), |
| + SCMP_SYS(setns), |
| + 0); |
| + else |
| + /* Otherwise, block only the invocations with the appropriate flags in the loop below, but also the |
| + * special invocation with a zero flags argument, right here. */ |
| + r = seccomp_rule_add( |
| + seccomp, |
| + SCMP_ACT_ERRNO(EPERM), |
| + SCMP_SYS(setns), |
| + 1, |
| + SCMP_A1(SCMP_CMP_EQ, 0)); |
| + if (r < 0) |
| + goto finish; |
| + |
| + for (i = 0; namespace_flag_map[i].name; i++) { |
| + unsigned long f; |
| + |
| + f = namespace_flag_map[i].flag; |
| + if ((retain & f) == f) { |
| + log_debug("Permitting %s.", namespace_flag_map[i].name); |
| + continue; |
| + } |
| + |
| + log_debug("Blocking %s.", namespace_flag_map[i].name); |
| + |
| + r = seccomp_rule_add( |
| + seccomp, |
| + SCMP_ACT_ERRNO(EPERM), |
| + SCMP_SYS(unshare), |
| + 1, |
| + SCMP_A0(SCMP_CMP_MASKED_EQ, f, f)); |
| + if (r < 0) |
| + goto finish; |
| + |
| + r = seccomp_rule_add( |
| + seccomp, |
| + SCMP_ACT_ERRNO(EPERM), |
| + SCMP_SYS(clone), |
| + 1, |
| + SCMP_A0(SCMP_CMP_MASKED_EQ, f, f)); |
| + if (r < 0) |
| + goto finish; |
| + |
| + if ((retain & NAMESPACE_FLAGS_ALL) != 0) { |
| + r = seccomp_rule_add( |
| + seccomp, |
| + SCMP_ACT_ERRNO(EPERM), |
| + SCMP_SYS(setns), |
| + 1, |
| + SCMP_A1(SCMP_CMP_MASKED_EQ, f, f)); |
| + if (r < 0) |
| + goto finish; |
| + } |
| + } |
| + |
| + r = seccomp_load(seccomp); |
| |
| +finish: |
| + seccomp_release(seccomp); |
| + return r; |
| } |
| diff --git a/src/shared/seccomp-util.h b/src/shared/seccomp-util.h |
| index 8e209efef2..e325dab628 100644 |
| --- a/src/shared/seccomp-util.h |
| +++ b/src/shared/seccomp-util.h |
| @@ -64,3 +64,5 @@ const SyscallFilterSet *syscall_filter_set_find(const char *name); |
| int seccomp_add_syscall_filter_set(scmp_filter_ctx seccomp, const SyscallFilterSet *set, uint32_t action); |
| |
| int seccomp_load_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action); |
| + |
| +int seccomp_restrict_namespaces(unsigned long retain); |
| diff --git a/src/test/test-seccomp.c b/src/test/test-seccomp.c |
| index 43d1567288..beb6a7f422 100644 |
| --- a/src/test/test-seccomp.c |
| +++ b/src/test/test-seccomp.c |
| @@ -20,10 +20,15 @@ |
| #include <stdlib.h> |
| #include <sys/eventfd.h> |
| #include <unistd.h> |
| +#include <sched.h> |
| |
| +#include "alloc-util.h" |
| #include "fd-util.h" |
| #include "macro.h" |
| +#include "missing.h" |
| +#include "nsflags.h" |
| #include "process-util.h" |
| +#include "raw-clone.h" |
| #include "seccomp-util.h" |
| #include "string-util.h" |
| #include "util.h" |
| @@ -125,12 +130,101 @@ static void test_filter_sets(void) { |
| } |
| } |
| |
| +static void test_restrict_namespace(void) { |
| + _cleanup_free_ char *s = NULL; |
| + pid_t pid; |
| + unsigned long ul; |
| + |
| + assert_se(namespace_flag_to_string(0) == NULL); |
| + assert_se(streq(namespace_flag_to_string(CLONE_NEWNS), "mnt")); |
| + assert_se(namespace_flag_to_string(CLONE_NEWNS|CLONE_NEWIPC) == NULL); |
| + assert_se(streq(namespace_flag_to_string(CLONE_NEWCGROUP), "cgroup")); |
| + |
| + assert_se(namespace_flag_from_string("mnt") == CLONE_NEWNS); |
| + assert_se(namespace_flag_from_string(NULL) == 0); |
| + assert_se(namespace_flag_from_string("") == 0); |
| + assert_se(namespace_flag_from_string("uts") == CLONE_NEWUTS); |
| + assert_se(namespace_flag_from_string(namespace_flag_to_string(CLONE_NEWUTS)) == CLONE_NEWUTS); |
| + assert_se(streq(namespace_flag_to_string(namespace_flag_from_string("ipc")), "ipc")); |
| + |
| + assert_se(namespace_flag_from_string_many(NULL, &ul) == 0 && ul == 0); |
| + assert_se(namespace_flag_from_string_many("", &ul) == 0 && ul == 0); |
| + assert_se(namespace_flag_from_string_many("mnt uts ipc", &ul) == 0 && ul == (CLONE_NEWNS|CLONE_NEWUTS|CLONE_NEWIPC)); |
| + |
| + assert_se(namespace_flag_to_string_many(NAMESPACE_FLAGS_ALL, &s) == 0); |
| + assert_se(streq(s, "cgroup ipc net mnt pid user uts")); |
| + assert_se(namespace_flag_from_string_many(s, &ul) == 0 && ul == NAMESPACE_FLAGS_ALL); |
| + |
| + if (!is_seccomp_available()) |
| + return; |
| + |
| + if (geteuid() != 0) |
| + return; |
| + |
| + pid = fork(); |
| + assert_se(pid >= 0); |
| + |
| + if (pid == 0) { |
| + |
| + assert_se(seccomp_restrict_namespaces(CLONE_NEWNS|CLONE_NEWNET) >= 0); |
| + |
| + assert_se(unshare(CLONE_NEWNS) == 0); |
| + assert_se(unshare(CLONE_NEWNET) == 0); |
| + assert_se(unshare(CLONE_NEWUTS) == -1); |
| + assert_se(errno == EPERM); |
| + assert_se(unshare(CLONE_NEWIPC) == -1); |
| + assert_se(errno == EPERM); |
| + assert_se(unshare(CLONE_NEWNET|CLONE_NEWUTS) == -1); |
| + assert_se(errno == EPERM); |
| + |
| + /* We use fd 0 (stdin) here, which of course will fail with EINVAL on setns(). Except of course our |
| + * seccomp filter worked, and hits first and makes it return EPERM */ |
| + assert_se(setns(0, CLONE_NEWNS) == -1); |
| + assert_se(errno == EINVAL); |
| + assert_se(setns(0, CLONE_NEWNET) == -1); |
| + assert_se(errno == EINVAL); |
| + assert_se(setns(0, CLONE_NEWUTS) == -1); |
| + assert_se(errno == EPERM); |
| + assert_se(setns(0, CLONE_NEWIPC) == -1); |
| + assert_se(errno == EPERM); |
| + assert_se(setns(0, CLONE_NEWNET|CLONE_NEWUTS) == -1); |
| + assert_se(errno == EPERM); |
| + assert_se(setns(0, 0) == -1); |
| + assert_se(errno == EPERM); |
| + |
| + pid = raw_clone(CLONE_NEWNS); |
| + assert_se(pid >= 0); |
| + if (pid == 0) |
| + _exit(EXIT_SUCCESS); |
| + pid = raw_clone(CLONE_NEWNET); |
| + assert_se(pid >= 0); |
| + if (pid == 0) |
| + _exit(EXIT_SUCCESS); |
| + pid = raw_clone(CLONE_NEWUTS); |
| + assert_se(pid < 0); |
| + assert_se(errno == EPERM); |
| + pid = raw_clone(CLONE_NEWIPC); |
| + assert_se(pid < 0); |
| + assert_se(errno == EPERM); |
| + pid = raw_clone(CLONE_NEWNET|CLONE_NEWUTS); |
| + assert_se(pid < 0); |
| + assert_se(errno == EPERM); |
| + |
| + _exit(EXIT_SUCCESS); |
| + } |
| + |
| + assert_se(wait_for_terminate_and_warn("nsseccomp", pid, true) == EXIT_SUCCESS); |
| +} |
| + |
| int main(int argc, char *argv[]) { |
| |
| + log_set_max_level(LOG_DEBUG); |
| + |
| test_seccomp_arch_to_string(); |
| test_architecture_table(); |
| test_syscall_filter_set_find(); |
| test_filter_sets(); |
| + test_restrict_namespace(); |
| |
| return 0; |
| } |