procd: add service instance watchdog
author Daniel Bailey <danielb@meshplusplus.com>
Mon, 13 Jul 2020 22:05:31 +0000 (15:05 -0700)
committer Daniel Golle <daniel@makrotopia.org>
Mon, 13 Jul 2020 23:16:09 +0000 (00:16 +0100)
Added instance watchdog which will eventually either terminate
or respawn an instance depending on the instance respawn setting.

Added service ubus method 'watchdog' which services the watchdog
timer and allows the watchdog mode and timeout of an instance to be updated.

Two modes: disabled or passive.

Disabled: cancels watchdog timer set for a given instance.

Passive: sets an instance timer which must be serviced, or the
instance will be stopped/restarted (depending on the instance
respawn value) when the timer expires.

Signed-off-by: Daniel Bailey <danielb@meshplusplus.com>
service/instance.c
service/instance.h
service/service.c

index c65da5051e936a47046582144bb46db0519dd7f6..c83e2277fcd2d60ef20d37648a46046a4bd3c844 100644 (file)
@@ -66,6 +66,7 @@ enum {
        INSTANCE_ATTR_OVERLAYDIR,
        INSTANCE_ATTR_TMPOVERLAYSIZE,
        INSTANCE_ATTR_BUNDLE,
+       INSTANCE_ATTR_WATCHDOG,
        __INSTANCE_ATTR_MAX
 };
 
@@ -97,6 +98,7 @@ static const struct blobmsg_policy instance_attr[__INSTANCE_ATTR_MAX] = {
        [INSTANCE_ATTR_OVERLAYDIR] = { "overlaydir", BLOBMSG_TYPE_STRING },
        [INSTANCE_ATTR_TMPOVERLAYSIZE] = { "tmpoverlaysize", BLOBMSG_TYPE_STRING },
        [INSTANCE_ATTR_BUNDLE] = { "bundle", BLOBMSG_TYPE_STRING },
+       [INSTANCE_ATTR_WATCHDOG] = { "watchdog", BLOBMSG_TYPE_ARRAY },
 };
 
 enum {
@@ -553,6 +555,11 @@ instance_start(struct service_instance *in)
                fcntl(epipe[0], F_SETFD, FD_CLOEXEC);
        }
 
+       if (in->watchdog.mode != INSTANCE_WATCHDOG_MODE_DISABLED) {
+               uloop_timeout_set(&in->watchdog.timeout, in->watchdog.freq * 1000);
+               DEBUG(2, "Started instance %s::%s watchdog timer : timeout = %d\n", in->srv->name, in->name, in->watchdog.freq);
+       }
+
        service_event("instance.start", in->srv->name, in->name);
 }
 
@@ -700,6 +707,7 @@ instance_exit(struct uloop_process *p, int ret)
 
        in->exit_code = instance_exit_code(ret);
        uloop_timeout_cancel(&in->timeout);
+       uloop_timeout_cancel(&in->watchdog.timeout);
        service_event("instance.stop", in->srv->name, in->name);
 
        if (in->halt) {
@@ -759,6 +767,19 @@ instance_restart(struct service_instance *in)
        uloop_timeout_set(&in->timeout, in->term_timeout * 1000);
 }
 
+/*
+ * Timeout callback of an instance's watchdog timer.
+ *
+ * Runs when the watchdog timer expires without having been re-armed.
+ * An instance with a respawn setting is restarted; any other instance
+ * is stopped with the halt flag set.
+ */
+static void
+instance_watchdog(struct uloop_timeout *t)
+{
+       struct service_instance *in;
+
+       /* the timer is embedded in the instance, so recover the owner from it */
+       in = container_of(t, struct service_instance, watchdog.timeout);
+       DEBUG(3, "instance %s::%s watchdog timer expired\n", in->srv->name, in->name);
+
+       if (!in->respawn) {
+               instance_stop(in, true);
+               return;
+       }
+
+       instance_restart(in);
+}
+
 static bool string_changed(const char *a, const char *b)
 {
        return !((!a && !b) || (a && b && !strcmp(a, b)));
@@ -825,6 +846,12 @@ instance_config_changed(struct service_instance *in, struct service_instance *in
        if (!blobmsg_list_equal(&in->errors, &in_new->errors))
                return true;
 
+       if (in->watchdog.mode != in_new->watchdog.mode)
+               return true;
+
+       if (in->watchdog.freq != in_new->watchdog.freq)
+               return true;
+
        return false;
 }
 
@@ -1184,6 +1211,35 @@ instance_config_parse(struct service_instance *in)
                        DEBUG(3, "unknown syslog facility '%s' given, using default (LOG_DAEMON)\n", blobmsg_get_string(tb[INSTANCE_ATTR_FACILITY]));
        }
 
+       /*
+        * Optional "watchdog" attribute: an array whose first two elements are
+        * read as strings and converted with atoi() into [ mode, timeout ].
+        * Missing elements keep the defaults (mode 0, timeout 30); elements
+        * beyond the second are ignored.
+        */
+       if (tb[INSTANCE_ATTR_WATCHDOG]) {
+               int i = 0;
+               /* signed on purpose: a negative atoi() result must fail the range checks below */
+               int vals[2] = { 0, 30 };
+
+               blobmsg_for_each_attr(cur2, tb[INSTANCE_ATTR_WATCHDOG], rem) {
+                       if (i >= 2)
+                               break;
+
+                       vals[i] = atoi(blobmsg_get_string(cur2));
+                       i++;
+               }
+
+               if (vals[0] >= 0 && vals[0] < __INSTANCE_WATCHDOG_MODE_MAX) {
+                       in->watchdog.mode = vals[0];
+                       DEBUG(3, "setting watchdog mode (%d)\n", vals[0]);
+               } else {
+                       in->watchdog.mode = 0;
+                       DEBUG(3, "unknown watchdog mode (%d) given, using default (0)\n", vals[0]);
+               }
+
+               if (vals[1] > 0) {
+                       in->watchdog.freq = vals[1];
+                       /* report the timeout that was applied, not the mode */
+                       DEBUG(3, "setting watchdog timeout (%d)\n", vals[1]);
+               } else {
+                       in->watchdog.freq = 30;
+                       DEBUG(3, "invalid watchdog timeout (%d) given, using default (30)\n", vals[1]);
+               }
+       }
+
        return true;
 }
 
@@ -1269,6 +1325,7 @@ instance_free(struct service_instance *in)
        instance_free_stdio(in);
        uloop_process_delete(&in->proc);
        uloop_timeout_cancel(&in->timeout);
+       uloop_timeout_cancel(&in->watchdog.timeout);
        trigger_del(in);
        watch_del(in);
        instance_config_cleanup(in);
@@ -1323,6 +1380,9 @@ instance_init(struct service_instance *in, struct service *s, struct blob_attr *
        blobmsg_list_simple_init(&in->limits);
        blobmsg_list_simple_init(&in->errors);
        blobmsg_list_simple_init(&in->jail.mount);
+
+       in->watchdog.timeout.cb = instance_watchdog;
+
        in->valid = instance_config_parse(in);
 }
 
@@ -1444,5 +1504,12 @@ void instance_dump(struct blob_buf *b, struct service_instance *in, int verbose)
        if (verbose && in->trigger)
                blobmsg_add_blob(b, in->trigger);
 
+       if (in->watchdog.mode != INSTANCE_WATCHDOG_MODE_DISABLED) {
+               void *r = blobmsg_open_table(b, "watchdog");
+               blobmsg_add_u32(b, "mode", in->watchdog.mode);
+               blobmsg_add_u32(b, "timeout", in->watchdog.freq);
+               blobmsg_close_table(b, r);
+       }
+
        blobmsg_close_table(b, i);
 }
index e8ee15caa600811c9728f731e48317bf30c2fc63..bb8a0c4b2fa2e25b5713e10f7cd7f847a37a9996 100644 (file)
@@ -39,6 +39,19 @@ struct jail {
        int argc;
 };
 
+typedef enum instance_watchdog { /* watchdog mode of a service instance */
+       INSTANCE_WATCHDOG_MODE_DISABLED, /* no watchdog timer is armed for the instance */
+       INSTANCE_WATCHDOG_MODE_PASSIVE, /* timer is armed and must be serviced before it expires, else the instance is stopped/restarted */
+       INSTANCE_WATCHDOG_MODE_ACTIVE, /* NOTE(review): accepted as a valid mode, but the visible code handles it like any other non-disabled mode - confirm intended semantics */
+       __INSTANCE_WATCHDOG_MODE_MAX, /* number of modes; exclusive upper bound when validating a mode value */
+} instance_watchdog_mode_t;
+
+struct watchdog { /* per-instance watchdog state, embedded in struct service_instance */
+       instance_watchdog_mode_t mode; /* current mode; DISABLED means the timer is not armed */
+       uint32_t freq; /* timer period; multiplied by 1000 when handed to uloop_timeout_set(), so presumably seconds */
+       struct uloop_timeout timeout; /* watchdog timer; its callback is set to instance_watchdog() in instance_init() */
+};
+
 struct service_instance {
        struct vlist_node node;
        struct service *srv;
@@ -95,6 +108,8 @@ struct service_instance {
        struct blobmsg_list file;
        struct blobmsg_list limits;
        struct blobmsg_list errors;
+
+       struct watchdog watchdog;
 };
 
 void instance_start(struct service_instance *in);
index fcf021556df8910614187baa13709cd9d8e9230e..9a174bc7fee896d58b81473b02b8e0f9c6a671e6 100644 (file)
@@ -784,6 +784,71 @@ err_console_fd:
        return UBUS_STATUS_INVALID_ARGUMENT;
 }
 
+/* Indices into the attribute table parsed for the "watchdog" ubus method. */
+enum {
+       SERVICE_WATCHDOG_MODE,
+       SERVICE_WATCHDOG_TIMEOUT,
+       SERVICE_WATCHDOG_NAME,
+       SERVICE_WATCHDOG_INSTANCE,
+       __SERVICE_WATCHDOG_MAX,
+};
+
+/* Arguments accepted by the "watchdog" ubus method, listed in enum order. */
+static const struct blobmsg_policy service_watchdog_policy[__SERVICE_WATCHDOG_MAX] = {
+       [SERVICE_WATCHDOG_MODE] = { "mode", BLOBMSG_TYPE_INT32 },
+       [SERVICE_WATCHDOG_TIMEOUT] = { "timeout", BLOBMSG_TYPE_INT32 },
+       [SERVICE_WATCHDOG_NAME] = { "name", BLOBMSG_TYPE_STRING },
+       [SERVICE_WATCHDOG_INSTANCE] = { "instance", BLOBMSG_TYPE_STRING },
+};
+
+/*
+ * ubus "watchdog" method: service (re-arm) and optionally reconfigure the
+ * watchdog of one service instance.
+ *
+ * Arguments (see service_watchdog_policy):
+ *   name     - service name (required)
+ *   instance - instance name within that service (required)
+ *   mode     - new watchdog mode (optional)
+ *   timeout  - new watchdog period (optional)
+ *
+ * Every successful call either cancels the timer (mode disabled) or re-arms
+ * it with the current period, which is how a client "services" the watchdog.
+ * The reply carries the name, instance, mode and timeout now in effect.
+ *
+ * Returns UBUS_STATUS_NOT_FOUND when name/instance is missing or unknown,
+ * UBUS_STATUS_INVALID_ARGUMENT when mode or timeout is out of range, and
+ * UBUS_STATUS_OK otherwise.
+ */
+static int
+service_handle_watchdog(struct ubus_context *ctx, struct ubus_object *obj,
+                   struct ubus_request_data *req, const char *method,
+                   struct blob_attr *msg)
+{
+       struct blob_attr *tb[__SERVICE_WATCHDOG_MAX] = {0};
+       struct service *s;
+       struct blob_attr *cur;
+       struct service_instance *in;
+       uint32_t mode;
+       uint32_t timeout;
+
+       blobmsg_parse(service_watchdog_policy, __SERVICE_WATCHDOG_MAX, tb, blobmsg_data(msg), blobmsg_data_len(msg));
+       cur = tb[SERVICE_WATCHDOG_NAME];
+       if (!cur)
+               return UBUS_STATUS_NOT_FOUND;
+
+       s = avl_find_element(&services, blobmsg_data(cur), s, avl);
+       if (!s)
+               return UBUS_STATUS_NOT_FOUND;
+
+       cur = tb[SERVICE_WATCHDOG_INSTANCE];
+       if (!cur)
+               return UBUS_STATUS_NOT_FOUND;
+
+       in = vlist_find(&s->instances, blobmsg_data(cur), in, node);
+       if (!in) {
+               ERROR("instance %s not found\n", blobmsg_get_string(cur));
+               return UBUS_STATUS_NOT_FOUND;
+       }
+
+       /*
+        * Validate the whole request before touching the instance, so that a
+        * rejected call leaves the current watchdog settings untouched.
+        */
+       mode = in->watchdog.mode;
+       if (tb[SERVICE_WATCHDOG_MODE]) {
+               mode = blobmsg_get_u32(tb[SERVICE_WATCHDOG_MODE]);
+               /* same range as accepted from the instance configuration */
+               if (mode >= __INSTANCE_WATCHDOG_MODE_MAX)
+                       return UBUS_STATUS_INVALID_ARGUMENT;
+       }
+
+       timeout = in->watchdog.freq;
+       if (tb[SERVICE_WATCHDOG_TIMEOUT]) {
+               timeout = blobmsg_get_u32(tb[SERVICE_WATCHDOG_TIMEOUT]);
+               /*
+                * 0 would make the timer fire immediately; anything above
+                * INT32_MAX / 1000 no longer fits the millisecond value
+                * computed below (assumes int is 32 bits wide - TODO confirm
+                * for other targets).
+                */
+               if (timeout == 0 || timeout > (uint32_t)(INT32_MAX / 1000))
+                       return UBUS_STATUS_INVALID_ARGUMENT;
+       }
+
+       /*
+        * The period is only initialised when the instance configuration
+        * carries a "watchdog" attribute. If the watchdog gets enabled here
+        * without one, fall back to the same default (30) the configuration
+        * parser uses instead of arming a zero-length timer.
+        */
+       if (mode != INSTANCE_WATCHDOG_MODE_DISABLED && timeout == 0)
+               timeout = 30;
+
+       in->watchdog.mode = mode;
+       in->watchdog.freq = timeout;
+
+       /* stop the timer, or (re-)arm it with the period now in effect */
+       if (in->watchdog.mode == INSTANCE_WATCHDOG_MODE_DISABLED)
+               uloop_timeout_cancel(&in->watchdog.timeout);
+       else
+               uloop_timeout_set(&in->watchdog.timeout, in->watchdog.freq * 1000);
+
+       blob_buf_init(&b, 0);
+       blobmsg_add_string(&b, "name", blobmsg_get_string(tb[SERVICE_WATCHDOG_NAME]));
+       blobmsg_add_string(&b, "instance", blobmsg_get_string(tb[SERVICE_WATCHDOG_INSTANCE]));
+       blobmsg_add_u32(&b, "mode", in->watchdog.mode);
+       blobmsg_add_u32(&b, "timeout", in->watchdog.freq);
+
+       ubus_send_reply(ctx, req, b.head);
+
+       return UBUS_STATUS_OK;
+}
 
 static struct ubus_method main_object_methods[] = {
        UBUS_METHOD("set", service_handle_set, service_set_attrs),
@@ -797,6 +862,7 @@ static struct ubus_method main_object_methods[] = {
        UBUS_METHOD("validate", service_handle_validate, validate_policy),
        UBUS_METHOD("get_data", service_get_data, get_data_policy),
        UBUS_METHOD("state", service_handle_state, service_state_attrs),
+       UBUS_METHOD("watchdog", service_handle_watchdog, service_watchdog_policy),
 };
 
 static struct ubus_object_type main_object_type =