1 From d9962b0d42029bcb40fe3c38bce06d1870fa4df4 Mon Sep 17 00:00:00 2001
2 From: Douglas Anderson <dianders@chromium.org>
3 Date: Fri, 20 Oct 2023 14:06:59 -0700
4 Subject: [PATCH] r8152: Block future register access if register access fails
6 Even though the functions to read/write registers can fail, most of
7 the places in the r8152 driver that read/write register values don't
8 check error codes. The lack of error code checking is problematic in
11 The first problem is that the r8152 driver often uses code patterns
17 ...with the above pattern, if the read_register() fails and returns
18 garbage then we'll end up trying to write modified garbage back to the
19 Realtek adapter. If the write_register() succeeds that's bad. Note
20 that as of commit f53a7ad18959 ("r8152: Set memory to all 0xFFs on
21 failed reg reads") the "garbage" returned by read_register() will at
22 least be consistent garbage, but it is still garbage.
24 It turns out that this problem is very serious. Writing garbage to
25 some of the hardware registers on the Ethernet adapter can put the
26 adapter in such a bad state that it needs to be power cycled (fully
27 unplugged and plugged in again) before it can enumerate again.
29 The second problem is that the r8152 driver generally has functions
30 that are long sequences of register writes. Assuming everything will
31 be OK if a random register write fails in the middle isn't a great
34 One might wonder if the above two problems are real. You could ask if
35 we would really have a successful write after a failed read. It turns
36 out that the answer appears to be "yes, this can happen". In fact,
37 we've seen at least two distinct failure modes where this happens.
39 On a sc7180-trogdor Chromebook if you drop into kdb for a while and
40 then resume, you can see:
41 1. We get a "Tx timeout"
42 2. The "Tx timeout" queues up a USB reset.
43 3. In rtl8152_pre_reset() we try to reinit the hardware.
44 4. The first several (2-9) register accesses fail with a timeout, then
47 The above test case was actually fixed by the patch ("r8152: Increase
48 USB control msg timeout to 5000ms as per spec") but at least shows
49 that we really can see successful calls after failed ones.
51 On a different (AMD) based Chromebook with a particular adapter, we
52 found that during reboot tests we'd also sometimes get a transitory
53 failure. In this case we saw -EPIPE being returned sometimes. Retrying
54 worked, but retrying is not always safe for all register accesses
55 since reading/writing some registers might have side effects (like
56 registers that clear on read).
58 Let's fully lock out all register access if a register access fails.
59 When we do this, we'll try to queue up a USB reset and try to unlock
60 register access after the reset. This is slightly tricker than it
61 sounds since the r8152 driver has an optimized reset sequence that
62 only works reliably after probe happens. In order to handle this, we
63 avoid the optimized reset if probe didn't finish. Instead, we simply
64 retry the probe routine in this case.
66 When locking out access, we'll use the existing infrastructure that
67 the driver was using when it detected we were unplugged. This keeps us
68 from getting stuck in delay loops in some parts of the driver.
70 Signed-off-by: Douglas Anderson <dianders@chromium.org>
71 Reviewed-by: Grant Grundler <grundler@chromium.org>
72 Signed-off-by: David S. Miller <davem@davemloft.net>
74 drivers/net/usb/r8152.c | 207 ++++++++++++++++++++++++++++++++++------
75 1 file changed, 176 insertions(+), 31 deletions(-)
77 --- a/drivers/net/usb/r8152.c
78 +++ b/drivers/net/usb/r8152.c
79 @@ -772,6 +772,9 @@ enum rtl8152_flags {
84 + PROBED_WITH_NO_ERRORS,
88 #define DEVICE_ID_LENOVO_USB_C_TRAVEL_HUB 0x721e
89 @@ -952,6 +955,8 @@ struct r8152 {
94 + unsigned int reg_access_reset_count;
98 @@ -1199,6 +1204,96 @@ static unsigned int agg_buf_sz = 16384;
100 #define RTL_LIMITED_TSO_SIZE (size_to_mtu(agg_buf_sz) - sizeof(struct tx_desc))
102 +/* If register access fails then we block access and issue a reset. If this
103 + * happens too many times in a row without a successful access then we stop
104 + * trying to reset and just leave access blocked.
106 +#define REGISTER_ACCESS_MAX_RESETS 3
108 +static void rtl_set_inaccessible(struct r8152 *tp)
110 + set_bit(RTL8152_INACCESSIBLE, &tp->flags);
111 + smp_mb__after_atomic();
114 +static void rtl_set_accessible(struct r8152 *tp)
116 + clear_bit(RTL8152_INACCESSIBLE, &tp->flags);
117 + smp_mb__after_atomic();
121 +int r8152_control_msg(struct r8152 *tp, unsigned int pipe, __u8 request,
122 + __u8 requesttype, __u16 value, __u16 index, void *data,
123 + __u16 size, const char *msg_tag)
125 + struct usb_device *udev = tp->udev;
128 + if (test_bit(RTL8152_INACCESSIBLE, &tp->flags))
131 + ret = usb_control_msg(udev, pipe, request, requesttype,
132 + value, index, data, size,
133 + USB_CTRL_GET_TIMEOUT);
135 + /* No need to issue a reset to report an error if the USB device got
136 + * unplugged; just return immediately.
138 + if (ret == -ENODEV)
141 + /* If the write was successful then we're done */
143 + tp->reg_access_reset_count = 0;
147 + dev_err(&udev->dev,
148 + "Failed to %s %d bytes at %#06x/%#06x (%d)\n",
149 + msg_tag, size, value, index, ret);
151 + /* Block all future register access until we reset. Much of the code
152 + * in the driver doesn't check for errors. Notably, many parts of the
153 + * driver do a read/modify/write of a register value without
154 + * confirming that the read succeeded. Writing back modified garbage
155 + * like this can fully wedge the adapter, requiring a power cycle.
157 + rtl_set_inaccessible(tp);
159 + /* If probe hasn't yet finished, then we'll request a retry of the
160 + * whole probe routine if we get any control transfer errors. We
161 + * never have to clear this bit since we free/reallocate the whole "tp"
162 + * structure if we retry probe.
164 + if (!test_bit(PROBED_WITH_NO_ERRORS, &tp->flags)) {
165 + set_bit(PROBE_SHOULD_RETRY, &tp->flags);
169 + /* Failing to access registers in pre-reset is not surprising since we
170 + * wouldn't be resetting if things were behaving normally. The register
171 + * access we do in pre-reset isn't truly mandatory--we're just reusing
172 + * the disable() function and trying to be nice by powering the
173 + * adapter down before resetting it. Thus, if we're in pre-reset,
174 + * we'll return right away and not try to queue up yet another reset.
175 + * We know the post-reset is already coming.
177 + if (test_bit(IN_PRE_RESET, &tp->flags))
180 + if (tp->reg_access_reset_count < REGISTER_ACCESS_MAX_RESETS) {
181 + usb_queue_reset_device(tp->intf);
182 + tp->reg_access_reset_count++;
183 + } else if (tp->reg_access_reset_count == REGISTER_ACCESS_MAX_RESETS) {
184 + dev_err(&udev->dev,
185 + "Tried to reset %d times; giving up.\n",
186 + REGISTER_ACCESS_MAX_RESETS);
193 int get_registers(struct r8152 *tp, u16 value, u16 index, u16 size, void *data)
195 @@ -1209,9 +1304,10 @@ int get_registers(struct r8152 *tp, u16
199 - ret = usb_control_msg(tp->udev, tp->pipe_ctrl_in,
200 - RTL8152_REQ_GET_REGS, RTL8152_REQT_READ,
201 - value, index, tmp, size, USB_CTRL_GET_TIMEOUT);
202 + ret = r8152_control_msg(tp, tp->pipe_ctrl_in,
203 + RTL8152_REQ_GET_REGS, RTL8152_REQT_READ,
204 + value, index, tmp, size, "read");
207 memset(data, 0xff, size);
209 @@ -1232,9 +1328,9 @@ int set_registers(struct r8152 *tp, u16
213 - ret = usb_control_msg(tp->udev, tp->pipe_ctrl_out,
214 - RTL8152_REQ_SET_REGS, RTL8152_REQT_WRITE,
215 - value, index, tmp, size, USB_CTRL_SET_TIMEOUT);
216 + ret = r8152_control_msg(tp, tp->pipe_ctrl_out,
217 + RTL8152_REQ_SET_REGS, RTL8152_REQT_WRITE,
218 + value, index, tmp, size, "write");
222 @@ -1243,10 +1339,8 @@ int set_registers(struct r8152 *tp, u16
224 static void rtl_set_unplug(struct r8152 *tp)
226 - if (tp->udev->state == USB_STATE_NOTATTACHED) {
227 - set_bit(RTL8152_INACCESSIBLE, &tp->flags);
228 - smp_mb__after_atomic();
230 + if (tp->udev->state == USB_STATE_NOTATTACHED)
231 + rtl_set_inaccessible(tp);
234 static int generic_ocp_read(struct r8152 *tp, u16 index, u16 size,
235 @@ -8261,7 +8355,7 @@ static int rtl8152_pre_reset(struct usb_
236 struct r8152 *tp = usb_get_intfdata(intf);
237 struct net_device *netdev;
240 + if (!tp || !test_bit(PROBED_WITH_NO_ERRORS, &tp->flags))
244 @@ -8276,7 +8370,9 @@ static int rtl8152_pre_reset(struct usb_
245 napi_disable(&tp->napi);
246 if (netif_carrier_ok(netdev)) {
247 mutex_lock(&tp->control);
248 + set_bit(IN_PRE_RESET, &tp->flags);
249 tp->rtl_ops.disable(tp);
250 + clear_bit(IN_PRE_RESET, &tp->flags);
251 mutex_unlock(&tp->control);
254 @@ -8289,9 +8385,11 @@ static int rtl8152_post_reset(struct usb
255 struct net_device *netdev;
259 + if (!tp || !test_bit(PROBED_WITH_NO_ERRORS, &tp->flags))
262 + rtl_set_accessible(tp);
264 /* reset the MAC address in case of policy change */
265 if (determine_ethernet_addr(tp, &sa) >= 0) {
267 @@ -9493,17 +9591,29 @@ static u8 __rtl_get_hw_ver(struct usb_de
273 tmp = kmalloc(sizeof(*tmp), GFP_KERNEL);
277 - ret = usb_control_msg(udev, usb_rcvctrlpipe(udev, 0),
278 - RTL8152_REQ_GET_REGS, RTL8152_REQT_READ,
279 - PLA_TCR0, MCU_TYPE_PLA, tmp, sizeof(*tmp),
280 - USB_CTRL_GET_TIMEOUT);
282 - ocp_data = (__le32_to_cpu(*tmp) >> 16) & VERSION_MASK;
283 + /* Retry up to 3 times in case there is a transitory error. We do this
284 + * since retrying a read of the version is always safe and this
285 + * function doesn't take advantage of r8152_control_msg().
287 + for (i = 0; i < 3; i++) {
288 + ret = usb_control_msg(udev, usb_rcvctrlpipe(udev, 0),
289 + RTL8152_REQ_GET_REGS, RTL8152_REQT_READ,
290 + PLA_TCR0, MCU_TYPE_PLA, tmp, sizeof(*tmp),
291 + USB_CTRL_GET_TIMEOUT);
293 + ocp_data = (__le32_to_cpu(*tmp) >> 16) & VERSION_MASK;
298 + if (i != 0 && ret > 0)
299 + dev_warn(&udev->dev, "Needed %d retries to read version\n", i);
303 @@ -9602,25 +9712,14 @@ static bool rtl8152_supports_lenovo_macp
307 -static int rtl8152_probe(struct usb_interface *intf,
308 - const struct usb_device_id *id)
309 +static int rtl8152_probe_once(struct usb_interface *intf,
310 + const struct usb_device_id *id, u8 version)
312 struct usb_device *udev = interface_to_usbdev(intf);
314 struct net_device *netdev;
318 - if (intf->cur_altsetting->desc.bInterfaceClass != USB_CLASS_VENDOR_SPEC)
321 - if (!rtl_check_vendor_ok(intf))
324 - version = rtl8152_get_version(intf);
325 - if (version == RTL_VER_UNKNOWN)
328 usb_reset_device(udev);
329 netdev = alloc_etherdev(sizeof(struct r8152));
331 @@ -9783,10 +9882,20 @@ static int rtl8152_probe(struct usb_inte
333 device_set_wakeup_enable(&udev->dev, false);
335 + /* If we saw a control transfer error while probing then we may
336 + * want to try probe() again. Consider this an error.
338 + if (test_bit(PROBE_SHOULD_RETRY, &tp->flags))
341 + set_bit(PROBED_WITH_NO_ERRORS, &tp->flags);
342 netif_info(tp, probe, netdev, "%s\n", DRIVER_VERSION);
347 + unregister_netdev(netdev);
350 tasklet_kill(&tp->tx_tl);
351 cancel_delayed_work_sync(&tp->hw_phy_work);
352 @@ -9795,10 +9904,46 @@ out1:
353 rtl8152_release_firmware(tp);
354 usb_set_intfdata(intf, NULL);
356 + if (test_bit(PROBE_SHOULD_RETRY, &tp->flags))
363 +#define RTL8152_PROBE_TRIES 3
365 +static int rtl8152_probe(struct usb_interface *intf,
366 + const struct usb_device_id *id)
372 + if (intf->cur_altsetting->desc.bInterfaceClass != USB_CLASS_VENDOR_SPEC)
375 + if (!rtl_check_vendor_ok(intf))
378 + version = rtl8152_get_version(intf);
379 + if (version == RTL_VER_UNKNOWN)
382 + for (i = 0; i < RTL8152_PROBE_TRIES; i++) {
383 + ret = rtl8152_probe_once(intf, id, version);
384 + if (ret != -EAGAIN)
387 + if (ret == -EAGAIN) {
388 + dev_err(&intf->dev,
389 + "r8152 failed probe after %d tries; giving up\n", i);
396 static void rtl8152_disconnect(struct usb_interface *intf)
398 struct r8152 *tp = usb_get_intfdata(intf);