From d5fbda61ac923e0adb89fd59fdf4a1d99406b86e Mon Sep 17 00:00:00 2001
From: Arjun Vynipadath
Date: Tue, 26 Jun 2018 17:10:25 +0530
Subject: [PATCH] cxgb4: Add support for FW_ETH_TX_PKT_VM_WR

The present TX work request (FW_ETH_TX_PKT_WR) can't be used for
host->vf communication, since it doesn't loop back the outgoing
packets to virtual interfaces on the same port. This can be done
using FW_ETH_TX_PKT_VM_WR.
This fix depends on ethtool_flags to determine which WR to use for
the TX path. Support for setting these flags by the user is added
in the next commit.

Based on the original work by: Casey Leedom

Signed-off-by: Casey Leedom
Signed-off-by: Arjun Vynipadath
Signed-off-by: Ganesh Goudar
Signed-off-by: David S. Miller
---
 drivers/net/ethernet/chelsio/cxgb4/cxgb4.h    |  13 +-
 .../net/ethernet/chelsio/cxgb4/cxgb4_main.c   |   2 +-
 drivers/net/ethernet/chelsio/cxgb4/sge.c      | 372 +++++++++++++++++-
 3 files changed, 383 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
index f27b2f0ade0b..4a8cbd864ef7 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
@@ -522,6 +522,15 @@ enum {
 	MAX_INGQ = MAX_ETH_QSETS + INGQ_EXTRAS,
 };
 
+enum {
+	PRIV_FLAG_PORT_TX_VM_BIT,
+};
+
+#define PRIV_FLAG_PORT_TX_VM	BIT(PRIV_FLAG_PORT_TX_VM_BIT)
+
+#define PRIV_FLAGS_ADAP		0
+#define PRIV_FLAGS_PORT		PRIV_FLAG_PORT_TX_VM
+
 struct adapter;
 struct sge_rspq;
 
@@ -558,6 +567,7 @@ struct port_info {
 	struct hwtstamp_config tstamp_config;
 	bool ptp_enable;
 	struct sched_table *sched_tbl;
+	u32 eth_flags;
 };
 
 struct dentry;
@@ -868,6 +878,7 @@ struct adapter {
 	unsigned int flags;
 	unsigned int adap_idx;
 	enum chip_type chip;
+	u32 eth_flags;
 
 	int msg_enable;
 	__be16 vxlan_port;
@@ -1335,7 +1346,7 @@ void t4_os_link_changed(struct adapter *adap, int port_id, int link_stat);
 void t4_free_sge_resources(struct adapter *adap);
 void t4_free_ofld_rxqs(struct adapter *adap, int n, struct sge_ofld_rxq *q);
 irq_handler_t t4_intr_handler(struct adapter *adap);
-netdev_tx_t t4_eth_xmit(struct sk_buff *skb, struct net_device *dev);
+netdev_tx_t t4_start_xmit(struct sk_buff *skb, struct net_device *dev);
 int t4_ethrx_handler(struct sge_rspq *q, const __be64 *rsp,
 		     const struct pkt_gl *gl);
 int t4_mgmt_tx(struct adapter *adap, struct sk_buff *skb);
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index 96bc177d54de..1c0374cbb890 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -3217,7 +3217,7 @@ static netdev_features_t cxgb_fix_features(struct net_device *dev,
 static const struct net_device_ops cxgb4_netdev_ops = {
 	.ndo_open             = cxgb_open,
 	.ndo_stop             = cxgb_close,
-	.ndo_start_xmit       = t4_eth_xmit,
+	.ndo_start_xmit       = t4_start_xmit,
 	.ndo_select_queue     = cxgb_select_queue,
 	.ndo_get_stats64      = cxgb_get_stats,
 	.ndo_set_rx_mode      = cxgb_set_rxmode,
diff --git a/drivers/net/ethernet/chelsio/cxgb4/sge.c b/drivers/net/ethernet/chelsio/cxgb4/sge.c
index 395e2a0e8d7f..ebb46c472d52 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/sge.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/sge.c
@@ -1288,13 +1288,13 @@ static inline void t6_fill_tnl_lso(struct sk_buff *skb,
 }
 
 /**
- * t4_eth_xmit - add a packet to an Ethernet Tx queue
+ * cxgb4_eth_xmit - add a packet to an Ethernet Tx queue
 * @skb: the packet
 * @dev: the egress net device
 *
 * Add a packet to an SGE Ethernet Tx queue. Runs with softirqs disabled.
*/ -netdev_tx_t t4_eth_xmit(struct sk_buff *skb, struct net_device *dev) +static netdev_tx_t cxgb4_eth_xmit(struct sk_buff *skb, struct net_device *dev) { u32 wr_mid, ctrl0, op; u64 cntrl, *end, *sgl; @@ -1547,6 +1547,374 @@ out_free: dev_kfree_skb_any(skb); return NETDEV_TX_OK; } +/* Constants ... */ +enum { + /* Egress Queue sizes, producer and consumer indices are all in units + * of Egress Context Units bytes. Note that as far as the hardware is + * concerned, the free list is an Egress Queue (the host produces free + * buffers which the hardware consumes) and free list entries are + * 64-bit PCI DMA addresses. + */ + EQ_UNIT = SGE_EQ_IDXSIZE, + FL_PER_EQ_UNIT = EQ_UNIT / sizeof(__be64), + TXD_PER_EQ_UNIT = EQ_UNIT / sizeof(__be64), + + T4VF_ETHTXQ_MAX_HDR = (sizeof(struct fw_eth_tx_pkt_vm_wr) + + sizeof(struct cpl_tx_pkt_lso_core) + + sizeof(struct cpl_tx_pkt_core)) / sizeof(__be64), +}; + +/** + * t4vf_is_eth_imm - can an Ethernet packet be sent as immediate data? + * @skb: the packet + * + * Returns whether an Ethernet packet is small enough to fit completely as + * immediate data. + */ +static inline int t4vf_is_eth_imm(const struct sk_buff *skb) +{ + /* The VF Driver uses the FW_ETH_TX_PKT_VM_WR firmware Work Request + * which does not accommodate immediate data. We could dike out all + * of the support code for immediate data but that would tie our hands + * too much if we ever want to enhace the firmware. It would also + * create more differences between the PF and VF Drivers. + */ + return false; +} + +/** + * t4vf_calc_tx_flits - calculate the number of flits for a packet TX WR + * @skb: the packet + * + * Returns the number of flits needed for a TX Work Request for the + * given Ethernet packet, including the needed WR and CPL headers. + */ +static inline unsigned int t4vf_calc_tx_flits(const struct sk_buff *skb) +{ + unsigned int flits; + + /* If the skb is small enough, we can pump it out as a work request + * with only immediate data. In that case we just have to have the + * TX Packet header plus the skb data in the Work Request. + */ + if (t4vf_is_eth_imm(skb)) + return DIV_ROUND_UP(skb->len + sizeof(struct cpl_tx_pkt), + sizeof(__be64)); + + /* Otherwise, we're going to have to construct a Scatter gather list + * of the skb body and fragments. We also include the flits necessary + * for the TX Packet Work Request and CPL. We always have a firmware + * Write Header (incorporated as part of the cpl_tx_pkt_lso and + * cpl_tx_pkt structures), followed by either a TX Packet Write CPL + * message or, if we're doing a Large Send Offload, an LSO CPL message + * with an embedded TX Packet Write CPL message. + */ + flits = sgl_len(skb_shinfo(skb)->nr_frags + 1); + if (skb_shinfo(skb)->gso_size) + flits += (sizeof(struct fw_eth_tx_pkt_vm_wr) + + sizeof(struct cpl_tx_pkt_lso_core) + + sizeof(struct cpl_tx_pkt_core)) / sizeof(__be64); + else + flits += (sizeof(struct fw_eth_tx_pkt_vm_wr) + + sizeof(struct cpl_tx_pkt_core)) / sizeof(__be64); + return flits; +} + +/** + * cxgb4_vf_eth_xmit - add a packet to an Ethernet TX queue + * @skb: the packet + * @dev: the egress net device + * + * Add a packet to an SGE Ethernet TX queue. Runs with softirqs disabled. 
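+ *
+ * This path is taken (via t4_start_xmit()) when the port's
+ * PRIV_FLAG_PORT_TX_VM flag is set.  The FW_ETH_TX_PKT_VM_WR Work
+ * Request carries a copy of the Ethernet header so that the firmware
+ * can loop outgoing packets back to virtual interfaces on the same
+ * port.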
+ */ +static netdev_tx_t cxgb4_vf_eth_xmit(struct sk_buff *skb, + struct net_device *dev) +{ + dma_addr_t addr[MAX_SKB_FRAGS + 1]; + const struct skb_shared_info *ssi; + struct fw_eth_tx_pkt_vm_wr *wr; + int qidx, credits, max_pkt_len; + struct cpl_tx_pkt_core *cpl; + const struct port_info *pi; + unsigned int flits, ndesc; + struct sge_eth_txq *txq; + struct adapter *adapter; + u64 cntrl, *end; + u32 wr_mid; + const size_t fw_hdr_copy_len = sizeof(wr->ethmacdst) + + sizeof(wr->ethmacsrc) + + sizeof(wr->ethtype) + + sizeof(wr->vlantci); + + /* The chip minimum packet length is 10 octets but the firmware + * command that we are using requires that we copy the Ethernet header + * (including the VLAN tag) into the header so we reject anything + * smaller than that ... + */ + if (unlikely(skb->len < fw_hdr_copy_len)) + goto out_free; + + /* Discard the packet if the length is greater than mtu */ + max_pkt_len = ETH_HLEN + dev->mtu; + if (skb_vlan_tag_present(skb)) + max_pkt_len += VLAN_HLEN; + if (!skb_shinfo(skb)->gso_size && (unlikely(skb->len > max_pkt_len))) + goto out_free; + + /* Figure out which TX Queue we're going to use. */ + pi = netdev_priv(dev); + adapter = pi->adapter; + qidx = skb_get_queue_mapping(skb); + WARN_ON(qidx >= pi->nqsets); + txq = &adapter->sge.ethtxq[pi->first_qset + qidx]; + + /* Take this opportunity to reclaim any TX Descriptors whose DMA + * transfers have completed. + */ + cxgb4_reclaim_completed_tx(adapter, &txq->q, true); + + /* Calculate the number of flits and TX Descriptors we're going to + * need along with how many TX Descriptors will be left over after + * we inject our Work Request. + */ + flits = t4vf_calc_tx_flits(skb); + ndesc = flits_to_desc(flits); + credits = txq_avail(&txq->q) - ndesc; + + if (unlikely(credits < 0)) { + /* Not enough room for this packet's Work Request. Stop the + * TX Queue and return a "busy" condition. The queue will get + * started later on when the firmware informs us that space + * has opened up. + */ + eth_txq_stop(txq); + dev_err(adapter->pdev_dev, + "%s: TX ring %u full while queue awake!\n", + dev->name, qidx); + return NETDEV_TX_BUSY; + } + + if (!t4vf_is_eth_imm(skb) && + unlikely(cxgb4_map_skb(adapter->pdev_dev, skb, addr) < 0)) { + /* We need to map the skb into PCI DMA space (because it can't + * be in-lined directly into the Work Request) and the mapping + * operation failed. Record the error and drop the packet. + */ + txq->mapping_err++; + goto out_free; + } + + wr_mid = FW_WR_LEN16_V(DIV_ROUND_UP(flits, 2)); + if (unlikely(credits < ETHTXQ_STOP_THRES)) { + /* After we're done injecting the Work Request for this + * packet, we'll be below our "stop threshold" so stop the TX + * Queue now and schedule a request for an SGE Egress Queue + * Update message. The queue will get started later on when + * the firmware processes this Work Request and sends us an + * Egress Queue Status Update message indicating that space + * has opened up. + */ + eth_txq_stop(txq); + wr_mid |= FW_WR_EQUEQ_F | FW_WR_EQUIQ_F; + } + + /* Start filling in our Work Request. Note that we do _not_ handle + * the WR Header wrapping around the TX Descriptor Ring. If our + * maximum header size ever exceeds one TX Descriptor, we'll need to + * do something else here. 
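+	 *
+	 * (The WARN_ON() below checks for exactly that case: the largest
+	 * WR + CPL header, T4VF_ETHTXQ_MAX_HDR, must fit within a single
+	 * TX Descriptor.)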
+ */ + WARN_ON(DIV_ROUND_UP(T4VF_ETHTXQ_MAX_HDR, TXD_PER_EQ_UNIT) > 1); + wr = (void *)&txq->q.desc[txq->q.pidx]; + wr->equiq_to_len16 = cpu_to_be32(wr_mid); + wr->r3[0] = cpu_to_be32(0); + wr->r3[1] = cpu_to_be32(0); + skb_copy_from_linear_data(skb, (void *)wr->ethmacdst, fw_hdr_copy_len); + end = (u64 *)wr + flits; + + /* If this is a Large Send Offload packet we'll put in an LSO CPL + * message with an encapsulated TX Packet CPL message. Otherwise we + * just use a TX Packet CPL message. + */ + ssi = skb_shinfo(skb); + if (ssi->gso_size) { + struct cpl_tx_pkt_lso_core *lso = (void *)(wr + 1); + bool v6 = (ssi->gso_type & SKB_GSO_TCPV6) != 0; + int l3hdr_len = skb_network_header_len(skb); + int eth_xtra_len = skb_network_offset(skb) - ETH_HLEN; + + wr->op_immdlen = + cpu_to_be32(FW_WR_OP_V(FW_ETH_TX_PKT_VM_WR) | + FW_WR_IMMDLEN_V(sizeof(*lso) + + sizeof(*cpl))); + /* Fill in the LSO CPL message. */ + lso->lso_ctrl = + cpu_to_be32(LSO_OPCODE_V(CPL_TX_PKT_LSO) | + LSO_FIRST_SLICE_F | + LSO_LAST_SLICE_F | + LSO_IPV6_V(v6) | + LSO_ETHHDR_LEN_V(eth_xtra_len / 4) | + LSO_IPHDR_LEN_V(l3hdr_len / 4) | + LSO_TCPHDR_LEN_V(tcp_hdr(skb)->doff)); + lso->ipid_ofst = cpu_to_be16(0); + lso->mss = cpu_to_be16(ssi->gso_size); + lso->seqno_offset = cpu_to_be32(0); + if (is_t4(adapter->params.chip)) + lso->len = cpu_to_be32(skb->len); + else + lso->len = cpu_to_be32(LSO_T5_XFER_SIZE_V(skb->len)); + + /* Set up TX Packet CPL pointer, control word and perform + * accounting. + */ + cpl = (void *)(lso + 1); + + if (CHELSIO_CHIP_VERSION(adapter->params.chip) <= CHELSIO_T5) + cntrl = TXPKT_ETHHDR_LEN_V(eth_xtra_len); + else + cntrl = T6_TXPKT_ETHHDR_LEN_V(eth_xtra_len); + + cntrl |= TXPKT_CSUM_TYPE_V(v6 ? + TX_CSUM_TCPIP6 : TX_CSUM_TCPIP) | + TXPKT_IPHDR_LEN_V(l3hdr_len); + txq->tso++; + txq->tx_cso += ssi->gso_segs; + } else { + int len; + + len = (t4vf_is_eth_imm(skb) + ? skb->len + sizeof(*cpl) + : sizeof(*cpl)); + wr->op_immdlen = + cpu_to_be32(FW_WR_OP_V(FW_ETH_TX_PKT_VM_WR) | + FW_WR_IMMDLEN_V(len)); + + /* Set up TX Packet CPL pointer, control word and perform + * accounting. + */ + cpl = (void *)(wr + 1); + if (skb->ip_summed == CHECKSUM_PARTIAL) { + cntrl = hwcsum(adapter->params.chip, skb) | + TXPKT_IPCSUM_DIS_F; + txq->tx_cso++; + } else { + cntrl = TXPKT_L4CSUM_DIS_F | TXPKT_IPCSUM_DIS_F; + } + } + + /* If there's a VLAN tag present, add that to the list of things to + * do in this Work Request. + */ + if (skb_vlan_tag_present(skb)) { + txq->vlan_ins++; + cntrl |= TXPKT_VLAN_VLD_F | TXPKT_VLAN_V(skb_vlan_tag_get(skb)); + } + + /* Fill in the TX Packet CPL message header. */ + cpl->ctrl0 = cpu_to_be32(TXPKT_OPCODE_V(CPL_TX_PKT_XT) | + TXPKT_INTF_V(pi->port_id) | + TXPKT_PF_V(0)); + cpl->pack = cpu_to_be16(0); + cpl->len = cpu_to_be16(skb->len); + cpl->ctrl1 = cpu_to_be64(cntrl); + + /* Fill in the body of the TX Packet CPL message with either in-lined + * data or a Scatter/Gather List. + */ + if (t4vf_is_eth_imm(skb)) { + /* In-line the packet's data and free the skb since we don't + * need it any longer. + */ + cxgb4_inline_tx_skb(skb, &txq->q, cpl + 1); + dev_consume_skb_any(skb); + } else { + /* Write the skb's Scatter/Gather list into the TX Packet CPL + * message and retain a pointer to the skb so we can free it + * later when its DMA completes. (We store the skb pointer + * in the Software Descriptor corresponding to the last TX + * Descriptor used by the Work Request.) + * + * The retained skb will be freed when the corresponding TX + * Descriptors are reclaimed after their DMAs complete. 
+ * However, this could take quite a while since, in general, + * the hardware is set up to be lazy about sending DMA + * completion notifications to us and we mostly perform TX + * reclaims in the transmit routine. + * + * This is good for performamce but means that we rely on new + * TX packets arriving to run the destructors of completed + * packets, which open up space in their sockets' send queues. + * Sometimes we do not get such new packets causing TX to + * stall. A single UDP transmitter is a good example of this + * situation. We have a clean up timer that periodically + * reclaims completed packets but it doesn't run often enough + * (nor do we want it to) to prevent lengthy stalls. A + * solution to this problem is to run the destructor early, + * after the packet is queued but before it's DMAd. A con is + * that we lie to socket memory accounting, but the amount of + * extra memory is reasonable (limited by the number of TX + * descriptors), the packets do actually get freed quickly by + * new packets almost always, and for protocols like TCP that + * wait for acks to really free up the data the extra memory + * is even less. On the positive side we run the destructors + * on the sending CPU rather than on a potentially different + * completing CPU, usually a good thing. + * + * Run the destructor before telling the DMA engine about the + * packet to make sure it doesn't complete and get freed + * prematurely. + */ + struct ulptx_sgl *sgl = (struct ulptx_sgl *)(cpl + 1); + struct sge_txq *tq = &txq->q; + int last_desc; + + /* If the Work Request header was an exact multiple of our TX + * Descriptor length, then it's possible that the starting SGL + * pointer lines up exactly with the end of our TX Descriptor + * ring. If that's the case, wrap around to the beginning + * here ... + */ + if (unlikely((void *)sgl == (void *)tq->stat)) { + sgl = (void *)tq->desc; + end = (void *)((void *)tq->desc + + ((void *)end - (void *)tq->stat)); + } + + cxgb4_write_sgl(skb, tq, sgl, end, 0, addr); + skb_orphan(skb); + + last_desc = tq->pidx + ndesc - 1; + if (last_desc >= tq->size) + last_desc -= tq->size; + tq->sdesc[last_desc].skb = skb; + tq->sdesc[last_desc].sgl = sgl; + } + + /* Advance our internal TX Queue state, tell the hardware about + * the new TX descriptors and return success. + */ + txq_advance(&txq->q, ndesc); + + cxgb4_ring_tx_db(adapter, &txq->q, ndesc); + return NETDEV_TX_OK; + +out_free: + /* An error of some sort happened. Free the TX skb and tell the + * OS that we've "dealt" with the packet ... + */ + dev_kfree_skb_any(skb); + return NETDEV_TX_OK; +} + +netdev_tx_t t4_start_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct port_info *pi = netdev_priv(dev); + + if (unlikely(pi->eth_flags & PRIV_FLAG_PORT_TX_VM)) + return cxgb4_vf_eth_xmit(skb, dev); + + return cxgb4_eth_xmit(skb, dev); +} + /** * reclaim_completed_tx_imm - reclaim completed control-queue Tx descs * @q: the SGE control Tx queue -- 2.30.2
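
A quick worked example of the TX sizing math above: t4vf_calc_tx_flits()
sizes the Work Request in flits (8-byte units) and flits_to_desc()
converts that into 64-byte Egress Queue units, which is the "ndesc" that
cxgb4_vf_eth_xmit() compares against txq_avail(). The standalone sketch
below mirrors that arithmetic; the concrete sizes (32-byte
fw_eth_tx_pkt_vm_wr, 16-byte LSO and TX packet CPLs, 64-byte EQ_UNIT)
and the sgl_len()/flits_to_desc() formulas are assumptions drawn from
the driver's existing definitions, not something this patch introduces.

/* Hedged sketch of the FW_ETH_TX_PKT_VM_WR sizing arithmetic.  Assumed
 * sizes: 32-byte WR header, 16-byte LSO CPL, 16-byte TX packet CPL,
 * 64-byte EQ_UNIT; sgl_len()/flits_to_desc() follow the driver helpers.
 */
#include <stdio.h>

#define EQ_UNIT			64	/* assumed SGE_EQ_IDXSIZE */
#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

/* Flits needed for a Scatter/Gather List covering n DMA buffers; the
 * first address/length pair lives in the ulptx_sgl header itself.
 */
static unsigned int sgl_len(unsigned int n)
{
	n--;
	return (3 * n) / 2 + (n & 1) + 2;
}

/* Flits for one packet: SGL (linear buffer + page fragments) plus the
 * WR and CPL headers; the LSO CPL is only present for GSO packets.
 */
static unsigned int calc_tx_flits(unsigned int nfrags, int gso)
{
	unsigned int flits = sgl_len(nfrags + 1);

	flits += gso ? (32 + 16 + 16) / 8	/* WR + LSO CPL + TX CPL */
		     : (32 + 16) / 8;		/* WR + TX CPL */
	return flits;
}

int main(void)
{
	unsigned int flits = calc_tx_flits(3, 1);	/* e.g. a TSO skb with 3 frags */

	printf("flits=%u ndesc=%u\n", flits,
	       DIV_ROUND_UP(flits * 8, EQ_UNIT));	/* flits_to_desc() */
	return 0;
}

For that example the result is 15 flits, i.e. 2 TX Descriptors; if
txq_avail() cannot cover that, cxgb4_vf_eth_xmit() stops the queue and
returns NETDEV_TX_BUSY as shown in the patch.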