net: add a generic tracepoint for TX queue timeout
author Cong Wang <xiyou.wangcong@gmail.com>
Thu, 2 May 2019 02:56:59 +0000 (19:56 -0700)
committer David S. Miller <davem@davemloft.net>
Sat, 4 May 2019 04:41:41 +0000 (00:41 -0400)
Although devlink health reporting does a nice job of reporting TX
timeouts and other NIC errors, it requires driver support, and
currently only mlx5 has implemented it.
Until other drivers catch up, it is useful to have a
generic tracepoint to monitor this kind of TX timeout. We have
been suffering from TX timeouts with different drivers, and we plan to
start monitoring them with rasdaemon, which just needs a new tracepoint.

Sample output:

  ksoftirqd/1-16    [001] ..s2   144.043173: net_dev_xmit_timeout: dev=ens3 driver=e1000 queue=0

Cc: Eran Ben Elisha <eranbe@mellanox.com>
Cc: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Reviewed-by: Eran Ben Elisha <eranbe@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
include/trace/events/net.h
net/sched/sch_generic.c

index 1efd7d9b25fecdb27ec98e61367b1954d4a72c78..2399073c3afc603bd303960302cf42f5bb38044a 100644 (file)
@@ -95,6 +95,29 @@ TRACE_EVENT(net_dev_xmit,
                __get_str(name), __entry->skbaddr, __entry->len, __entry->rc)
 );
 
+TRACE_EVENT(net_dev_xmit_timeout,
+
+       TP_PROTO(struct net_device *dev,
+                int queue_index),
+
+       TP_ARGS(dev, queue_index),
+
+       TP_STRUCT__entry(
+               __string(       name,           dev->name       )
+               __string(       driver,         netdev_drivername(dev))
+               __field(        int,            queue_index     )
+       ),
+
+       TP_fast_assign(
+               __assign_str(name, dev->name);
+               __assign_str(driver, netdev_drivername(dev));
+               __entry->queue_index = queue_index;
+       ),
+
+       TP_printk("dev=%s driver=%s queue=%d",
+               __get_str(name), __get_str(driver), __entry->queue_index)
+);
+
 DECLARE_EVENT_CLASS(net_dev_template,
 
        TP_PROTO(struct sk_buff *skb),
index 848aab3693bd075613555515202a1203da476a80..cce1e9ee85af04109068f1e3025f587674864ffa 100644 (file)
@@ -32,6 +32,7 @@
 #include <net/pkt_sched.h>
 #include <net/dst.h>
 #include <trace/events/qdisc.h>
+#include <trace/events/net.h>
 #include <net/xfrm.h>
 
 /* Qdisc to use by default */
@@ -441,6 +442,7 @@ static void dev_watchdog(struct timer_list *t)
                        }
 
                        if (some_queue_timedout) {
+                               trace_net_dev_xmit_timeout(dev, i);
                                WARN_ONCE(1, KERN_INFO "NETDEV WATCHDOG: %s (%s): transmit queue %u timed out\n",
                                       dev->name, netdev_drivername(dev), i);
                                dev->netdev_ops->ndo_tx_timeout(dev);