drop_monitor: Add packet alert mode
author: Ido Schimmel <idosch@mellanox.com>
Sun, 11 Aug 2019 07:35:51 +0000 (10:35 +0300)
committer: David S. Miller <davem@davemloft.net>
Sun, 11 Aug 2019 17:53:30 +0000 (10:53 -0700)
So far drop monitor supported only one alert mode in which a summary of
locations in which packets were recently dropped was sent to user space.

This alert mode is sufficient in order to understand that packets were
dropped, but lacks information to perform a more detailed analysis.

Add a new alert mode in which the dropped packet itself is passed to
user space along with metadata: The drop location (as program counter
and resolved symbol), ingress netdevice and drop timestamp. More
metadata can be added in the future.

To avoid performing expensive operations in the context in which
kfree_skb() is invoked (which can be hard IRQ context), the dropped skb
is cloned and queued on a per-CPU skb drop list. Then, in process
context, the netlink message is allocated, prepared and finally sent to
user space.

The per-CPU skb drop list is limited to 1000 skbs to prevent exhausting
the system's memory. Subsequent patches will make this limit
configurable and also add a counter that indicates how many skbs were
tail dropped.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
include/uapi/linux/net_dropmon.h
net/core/drop_monitor.c

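diff --git a/include/uapi/linux/net_dropmon.h b/include/uapi/linux/net_dropmon.h
index 0fecdedeb6ca28c248c84f9db5b81360c9ace7b9..cfaaf75371b8316234f005b456e737c795d62637 100644
--- a/include/uapi/linux/net_dropmon.h
+++ b/include/uapi/linux/net_dropmon.h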
@@ -53,6 +53,7 @@ enum {
        NET_DM_CMD_CONFIG,
        NET_DM_CMD_START,
        NET_DM_CMD_STOP,
+       NET_DM_CMD_PACKET_ALERT,
        _NET_DM_CMD_MAX,
 };
 
@@ -63,12 +64,38 @@ enum {
  */
 #define NET_DM_GRP_ALERT 1
 
+enum net_dm_attr {
+       NET_DM_ATTR_UNSPEC,
+
+       NET_DM_ATTR_ALERT_MODE,                 /* u8 */
+       NET_DM_ATTR_PC,                         /* u64 */
+       NET_DM_ATTR_SYMBOL,                     /* string */
+       NET_DM_ATTR_IN_PORT,                    /* nested */
+       NET_DM_ATTR_TIMESTAMP,                  /* struct timespec */
+       NET_DM_ATTR_PROTO,                      /* u16 */
+       NET_DM_ATTR_PAYLOAD,                    /* binary */
+       NET_DM_ATTR_PAD,
+
+       __NET_DM_ATTR_MAX,
+       NET_DM_ATTR_MAX = __NET_DM_ATTR_MAX - 1
+};
+
 /**
  * enum net_dm_alert_mode - Alert mode.
  * @NET_DM_ALERT_MODE_SUMMARY: A summary of recent drops is sent to user space.
+ * @NET_DM_ALERT_MODE_PACKET: Each dropped packet is sent to user space along
+ *                            with metadata.
  */
 enum net_dm_alert_mode {
        NET_DM_ALERT_MODE_SUMMARY,
+       NET_DM_ALERT_MODE_PACKET,
+};
+
+enum {
+       NET_DM_ATTR_PORT_NETDEV_IFINDEX,        /* u32 */
+
+       __NET_DM_ATTR_PORT_MAX,
+       NET_DM_ATTR_PORT_MAX = __NET_DM_ATTR_PORT_MAX - 1
 };
 
 #endif
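diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
index 9cd2f662cb9ee8f413596593d3b2df770dbf5d28..ba765832413b425aac9f00537aa9c4403540091a 100644
--- a/net/core/drop_monitor.c
+++ b/net/core/drop_monitor.c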
@@ -54,6 +54,7 @@ static DEFINE_MUTEX(net_dm_mutex);
 struct per_cpu_dm_data {
        spinlock_t              lock;   /* Protects 'skb' and 'send_timer' */
        struct sk_buff          *skb;
+       struct sk_buff_head     drop_queue;
        struct work_struct      dm_alert_work;
        struct timer_list       send_timer;
 };
@@ -85,6 +86,14 @@ struct net_dm_alert_ops {
        void (*work_item_func)(struct work_struct *work);
 };
 
+struct net_dm_skb_cb {
+       void *pc;
+};
+
+#define NET_DM_SKB_CB(__skb) ((struct net_dm_skb_cb *)&((__skb)->cb[0]))
+
+#define NET_DM_QUEUE_LEN 1000
+
 static struct sk_buff *reset_per_cpu_data(struct per_cpu_dm_data *data)
 {
        size_t al;
@@ -257,8 +266,214 @@ static const struct net_dm_alert_ops net_dm_alert_summary_ops = {
        .work_item_func         = send_dm_alert,
 };
 
+static void net_dm_packet_trace_kfree_skb_hit(void *ignore,
+                                             struct sk_buff *skb,
+                                             void *location)
+{
+       ktime_t tstamp = ktime_get_real();
+       struct per_cpu_dm_data *data;
+       struct sk_buff *nskb;
+       unsigned long flags;
+
+       nskb = skb_clone(skb, GFP_ATOMIC);
+       if (!nskb)
+               return;
+
+       NET_DM_SKB_CB(nskb)->pc = location;
+       /* Override the timestamp because we care about the time when the
+        * packet was dropped.
+        */
+       nskb->tstamp = tstamp;
+
+       data = this_cpu_ptr(&dm_cpu_data);
+
+       spin_lock_irqsave(&data->drop_queue.lock, flags);
+       if (skb_queue_len(&data->drop_queue) < NET_DM_QUEUE_LEN)
+               __skb_queue_tail(&data->drop_queue, nskb);
+       else
+               goto unlock_free;
+       spin_unlock_irqrestore(&data->drop_queue.lock, flags);
+
+       schedule_work(&data->dm_alert_work);
+
+       return;
+
+unlock_free:
+       spin_unlock_irqrestore(&data->drop_queue.lock, flags);
+       consume_skb(nskb);
+}
+
+static void net_dm_packet_trace_napi_poll_hit(void *ignore,
+                                             struct napi_struct *napi,
+                                             int work, int budget)
+{
+}
+
+static size_t net_dm_in_port_size(void)
+{
+              /* NET_DM_ATTR_IN_PORT nest */
+       return nla_total_size(0) +
+              /* NET_DM_ATTR_PORT_NETDEV_IFINDEX */
+              nla_total_size(sizeof(u32));
+}
+
+#define NET_DM_MAX_SYMBOL_LEN 40
+
+static size_t net_dm_packet_report_size(size_t payload_len)
+{
+       size_t size;
+
+       size = nlmsg_msg_size(GENL_HDRLEN + net_drop_monitor_family.hdrsize);
+
+       return NLMSG_ALIGN(size) +
+              /* NET_DM_ATTR_PC */
+              nla_total_size(sizeof(u64)) +
+              /* NET_DM_ATTR_SYMBOL */
+              nla_total_size(NET_DM_MAX_SYMBOL_LEN + 1) +
+              /* NET_DM_ATTR_IN_PORT */
+              net_dm_in_port_size() +
+              /* NET_DM_ATTR_TIMESTAMP */
+              nla_total_size(sizeof(struct timespec)) +
+              /* NET_DM_ATTR_PROTO */
+              nla_total_size(sizeof(u16)) +
+              /* NET_DM_ATTR_PAYLOAD */
+              nla_total_size(payload_len);
+}
+
+static int net_dm_packet_report_in_port_put(struct sk_buff *msg, int ifindex)
+{
+       struct nlattr *attr;
+
+       attr = nla_nest_start(msg, NET_DM_ATTR_IN_PORT);
+       if (!attr)
+               return -EMSGSIZE;
+
+       if (ifindex &&
+           nla_put_u32(msg, NET_DM_ATTR_PORT_NETDEV_IFINDEX, ifindex))
+               goto nla_put_failure;
+
+       nla_nest_end(msg, attr);
+
+       return 0;
+
+nla_put_failure:
+       nla_nest_cancel(msg, attr);
+       return -EMSGSIZE;
+}
+
+static int net_dm_packet_report_fill(struct sk_buff *msg, struct sk_buff *skb,
+                                    size_t payload_len)
+{
+       u64 pc = (u64)(uintptr_t) NET_DM_SKB_CB(skb)->pc;
+       char buf[NET_DM_MAX_SYMBOL_LEN];
+       struct nlattr *attr;
+       struct timespec ts;
+       void *hdr;
+       int rc;
+
+       hdr = genlmsg_put(msg, 0, 0, &net_drop_monitor_family, 0,
+                         NET_DM_CMD_PACKET_ALERT);
+       if (!hdr)
+               return -EMSGSIZE;
+
+       if (nla_put_u64_64bit(msg, NET_DM_ATTR_PC, pc, NET_DM_ATTR_PAD))
+               goto nla_put_failure;
+
+       snprintf(buf, sizeof(buf), "%pS", NET_DM_SKB_CB(skb)->pc);
+       if (nla_put_string(msg, NET_DM_ATTR_SYMBOL, buf))
+               goto nla_put_failure;
+
+       rc = net_dm_packet_report_in_port_put(msg, skb->skb_iif);
+       if (rc)
+               goto nla_put_failure;
+
+       if (ktime_to_timespec_cond(skb->tstamp, &ts) &&
+           nla_put(msg, NET_DM_ATTR_TIMESTAMP, sizeof(ts), &ts))
+               goto nla_put_failure;
+
+       if (!payload_len)
+               goto out;
+
+       if (nla_put_u16(msg, NET_DM_ATTR_PROTO, be16_to_cpu(skb->protocol)))
+               goto nla_put_failure;
+
+       attr = skb_put(msg, nla_total_size(payload_len));
+       attr->nla_type = NET_DM_ATTR_PAYLOAD;
+       attr->nla_len = nla_attr_size(payload_len);
+       if (skb_copy_bits(skb, 0, nla_data(attr), payload_len))
+               goto nla_put_failure;
+
+out:
+       genlmsg_end(msg, hdr);
+
+       return 0;
+
+nla_put_failure:
+       genlmsg_cancel(msg, hdr);
+       return -EMSGSIZE;
+}
+
+#define NET_DM_MAX_PACKET_SIZE (0xffff - NLA_HDRLEN - NLA_ALIGNTO)
+
+static void net_dm_packet_report(struct sk_buff *skb)
+{
+       struct sk_buff *msg;
+       size_t payload_len;
+       int rc;
+
+       /* Make sure we start copying the packet from the MAC header */
+       if (skb->data > skb_mac_header(skb))
+               skb_push(skb, skb->data - skb_mac_header(skb));
+       else
+               skb_pull(skb, skb_mac_header(skb) - skb->data);
+
+       /* Ensure packet fits inside a single netlink attribute */
+       payload_len = min_t(size_t, skb->len, NET_DM_MAX_PACKET_SIZE);
+
+       msg = nlmsg_new(net_dm_packet_report_size(payload_len), GFP_KERNEL);
+       if (!msg)
+               goto out;
+
+       rc = net_dm_packet_report_fill(msg, skb, payload_len);
+       if (rc) {
+               nlmsg_free(msg);
+               goto out;
+       }
+
+       genlmsg_multicast(&net_drop_monitor_family, msg, 0, 0, GFP_KERNEL);
+
+out:
+       consume_skb(skb);
+}
+
+static void net_dm_packet_work(struct work_struct *work)
+{
+       struct per_cpu_dm_data *data;
+       struct sk_buff_head list;
+       struct sk_buff *skb;
+       unsigned long flags;
+
+       data = container_of(work, struct per_cpu_dm_data, dm_alert_work);
+
+       __skb_queue_head_init(&list);
+
+       spin_lock_irqsave(&data->drop_queue.lock, flags);
+       skb_queue_splice_tail_init(&data->drop_queue, &list);
+       spin_unlock_irqrestore(&data->drop_queue.lock, flags);
+
+       while ((skb = __skb_dequeue(&list)))
+               net_dm_packet_report(skb);
+}
+
+static const struct net_dm_alert_ops net_dm_alert_packet_ops = {
+       .kfree_skb_probe        = net_dm_packet_trace_kfree_skb_hit,
+       .napi_poll_probe        = net_dm_packet_trace_napi_poll_hit,
+       .work_item_func         = net_dm_packet_work,
+};
+
 static const struct net_dm_alert_ops *net_dm_alert_ops_arr[] = {
        [NET_DM_ALERT_MODE_SUMMARY]     = &net_dm_alert_summary_ops,
+       [NET_DM_ALERT_MODE_PACKET]      = &net_dm_alert_packet_ops,
 };
 
 static int net_dm_trace_on_set(struct netlink_ext_ack *extack)
@@ -326,9 +541,12 @@ static void net_dm_trace_off_set(void)
         */
        for_each_possible_cpu(cpu) {
                struct per_cpu_dm_data *data = &per_cpu(dm_cpu_data, cpu);
+               struct sk_buff *skb;
 
                del_timer_sync(&data->send_timer);
                cancel_work_sync(&data->dm_alert_work);
+               while ((skb = __skb_dequeue(&data->drop_queue)))
+                       consume_skb(skb);
        }
 
        list_for_each_entry_safe(new_stat, temp, &hw_stats_list, list) {
@@ -370,12 +588,61 @@ static int set_all_monitor_traces(int state, struct netlink_ext_ack *extack)
        return rc;
 }
 
+static int net_dm_alert_mode_get_from_info(struct genl_info *info,
+                                          enum net_dm_alert_mode *p_alert_mode)
+{
+       u8 val;
+
+       val = nla_get_u8(info->attrs[NET_DM_ATTR_ALERT_MODE]);
+
+       switch (val) {
+       case NET_DM_ALERT_MODE_SUMMARY: /* fall-through */
+       case NET_DM_ALERT_MODE_PACKET:
+               *p_alert_mode = val;
+               break;
+       default:
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
+static int net_dm_alert_mode_set(struct genl_info *info)
+{
+       struct netlink_ext_ack *extack = info->extack;
+       enum net_dm_alert_mode alert_mode;
+       int rc;
+
+       if (!info->attrs[NET_DM_ATTR_ALERT_MODE])
+               return 0;
+
+       rc = net_dm_alert_mode_get_from_info(info, &alert_mode);
+       if (rc) {
+               NL_SET_ERR_MSG_MOD(extack, "Invalid alert mode");
+               return -EINVAL;
+       }
+
+       net_dm_alert_mode = alert_mode;
+
+       return 0;
+}
+
 static int net_dm_cmd_config(struct sk_buff *skb,
                        struct genl_info *info)
 {
-       NL_SET_ERR_MSG_MOD(info->extack, "Command not supported");
+       struct netlink_ext_ack *extack = info->extack;
+       int rc;
 
-       return -EOPNOTSUPP;
+       if (trace_state == TRACE_ON) {
+               NL_SET_ERR_MSG_MOD(extack, "Cannot configure drop monitor while tracing is on");
+               return -EBUSY;
+       }
+
+       rc = net_dm_alert_mode_set(info);
+       if (rc)
+               return rc;
+
+       return 0;
 }
 
 static int net_dm_cmd_trace(struct sk_buff *skb,
@@ -430,6 +697,11 @@ out:
        return NOTIFY_DONE;
 }
 
+static const struct nla_policy net_dm_nl_policy[NET_DM_ATTR_MAX + 1] = {
+       [NET_DM_ATTR_UNSPEC] = { .strict_start_type = NET_DM_ATTR_UNSPEC + 1 },
+       [NET_DM_ATTR_ALERT_MODE] = { .type = NLA_U8 },
+};
+
 static const struct genl_ops dropmon_ops[] = {
        {
                .cmd = NET_DM_CMD_CONFIG,
@@ -467,6 +739,8 @@ static struct genl_family net_drop_monitor_family __ro_after_init = {
        .hdrsize        = 0,
        .name           = "NET_DM",
        .version        = 2,
+       .maxattr        = NET_DM_ATTR_MAX,
+       .policy         = net_dm_nl_policy,
        .pre_doit       = net_dm_nl_pre_doit,
        .post_doit      = net_dm_nl_post_doit,
        .module         = THIS_MODULE,
@@ -510,6 +784,7 @@ static int __init init_net_drop_monitor(void)
        for_each_possible_cpu(cpu) {
                data = &per_cpu(dm_cpu_data, cpu);
                spin_lock_init(&data->lock);
+               skb_queue_head_init(&data->drop_queue);
        }
 
        goto out;
@@ -539,6 +814,7 @@ static void exit_net_drop_monitor(void)
                 * to this struct and can free the skb inside it
                 */
                kfree_skb(data->skb);
+               WARN_ON(!skb_queue_empty(&data->drop_queue));
        }
 
        BUG_ON(genl_unregister_family(&net_drop_monitor_family));