rds: Extend RDS API for IPv6 support
author Ka-Cheong Poon <ka-cheong.poon@oracle.com>
Tue, 24 Jul 2018 03:51:23 +0000 (20:51 -0700)
committer David S. Miller <davem@davemloft.net>
Tue, 24 Jul 2018 04:17:44 +0000 (21:17 -0700)
There are many data structures (RDS socket options) used by RDS apps
which use a 32-bit integer to store an IP address. To support IPv6,
struct in6_addr needs to be used. To ensure backward compatibility, a
new data structure is introduced for each of those data structures
which use a 32-bit integer to represent an IP address, and new socket
options are introduced to use those new structures. This means that
existing apps should work without a problem with the new RDS module.
Apps which want to use IPv6 can use those new data structures and socket
options. An IPv4-mapped address is used to represent an IPv4
address in the new data structures.

v4: Revert changes to SO_RDS_TRANSPORT

Signed-off-by: Ka-Cheong Poon <ka-cheong.poon@oracle.com>
Acked-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
include/uapi/linux/rds.h
net/rds/connection.c
net/rds/ib.c
net/rds/ib_mr.h
net/rds/ib_rdma.c
net/rds/recv.c
net/rds/tcp.c

index 20c6bd0b00079e9edd199cc1c138c28d3129fc46..dc520e1a4123f7a60d4996b5d0df7d237199cdba 100644 (file)
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR Linux-OpenIB) */
 /*
- * Copyright (c) 2008 Oracle.  All rights reserved.
+ * Copyright (c) 2008, 2018 Oracle and/or its affiliates. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
 #define RDS_INFO_IB_CONNECTIONS                10008
 #define RDS_INFO_CONNECTION_STATS      10009
 #define RDS_INFO_IWARP_CONNECTIONS     10010
-#define RDS_INFO_LAST                  10010
+
+/* PF_RDS6 options */
+#define RDS6_INFO_CONNECTIONS          10011
+#define RDS6_INFO_SEND_MESSAGES                10012
+#define RDS6_INFO_RETRANS_MESSAGES     10013
+#define RDS6_INFO_RECV_MESSAGES                10014
+#define RDS6_INFO_SOCKETS              10015
+#define RDS6_INFO_TCP_SOCKETS          10016
+#define RDS6_INFO_IB_CONNECTIONS       10017
+
+#define RDS_INFO_LAST                  10017
 
 struct rds_info_counter {
        __u8    name[32];
@@ -140,6 +150,15 @@ struct rds_info_connection {
        __u8            flags;
 } __attribute__((packed));
 
+struct rds6_info_connection {  /* IPv6-capable variant of struct rds_info_connection */
+       __u64           next_tx_seq;
+       __u64           next_rx_seq;
+       struct in6_addr laddr;  /* an IPv4 address is stored as an IPv4-mapped address */
+       struct in6_addr faddr;  /* an IPv4 address is stored as an IPv4-mapped address */
+       __u8            transport[TRANSNAMSIZ];         /* null term ascii */
+       __u8            flags;  /* SENDING / CONNECTING / CONNECTED bits set by rds6_conn_info_visitor() */
+} __attribute__((packed));
+
 #define RDS_INFO_MESSAGE_FLAG_ACK               0x01
 #define RDS_INFO_MESSAGE_FLAG_FAST_ACK          0x02
 
@@ -153,6 +172,17 @@ struct rds_info_message {
        __u8            flags;
 } __attribute__((packed));
 
+struct rds6_info_message {     /* IPv6-capable variant of struct rds_info_message */
+       __u64   seq;    /* message header sequence number, converted to host byte order */
+       __u32   len;    /* message header length field, converted to host byte order */
+       struct in6_addr laddr;
+       struct in6_addr faddr;
+       __be16          lport;  /* taken from the message header without byte-order conversion */
+       __be16          fport;  /* taken from the message header without byte-order conversion */
+       __u8            flags;
+       __u8            tos;
+} __attribute__((packed));
+
 struct rds_info_socket {
        __u32           sndbuf;
        __be32          bound_addr;
@@ -163,6 +193,16 @@ struct rds_info_socket {
        __u64           inum;
 } __attribute__((packed));
 
+struct rds6_info_socket {      /* IPv6-capable variant of struct rds_info_socket */
+       __u32           sndbuf;
+       struct in6_addr bound_addr;     /* struct in6_addr replaces the __be32 of the IPv4 structure */
+       struct in6_addr connected_addr; /* struct in6_addr replaces the __be32 of the IPv4 structure */
+       __be16          bound_port;
+       __be16          connected_port;
+       __u32           rcvbuf;
+       __u64           inum;
+} __attribute__((packed));
+
 struct rds_info_tcp_socket {
        __be32          local_addr;
        __be16          local_port;
@@ -175,6 +215,18 @@ struct rds_info_tcp_socket {
        __u32           last_seen_una;
 } __attribute__((packed));
 
+struct rds6_info_tcp_socket {  /* IPv6-capable variant of struct rds_info_tcp_socket */
+       struct in6_addr local_addr;     /* socket's sk_v6_rcv_saddr */
+       __be16          local_port;     /* socket's inet_sport */
+       struct in6_addr peer_addr;      /* socket's sk_v6_daddr */
+       __be16          peer_port;      /* socket's inet_dport */
+       __u64           hdr_rem;        /* connection's t_tinc_hdr_rem */
+       __u64           data_rem;       /* connection's t_tinc_data_rem */
+       __u32           last_sent_nxt;
+       __u32           last_expected_una;
+       __u32           last_seen_una;
+} __attribute__((packed));
+
 #define RDS_IB_GID_LEN 16
 struct rds_info_rdma_connection {
        __be32          src_addr;
@@ -189,6 +241,19 @@ struct rds_info_rdma_connection {
        __u32           rdma_mr_size;
 };
 
+struct rds6_info_rdma_connection {     /* IPv6-capable variant of struct rds_info_rdma_connection; not packed */
+       struct in6_addr src_addr;       /* connection's c_laddr */
+       struct in6_addr dst_addr;       /* connection's c_faddr */
+       __u8            src_gid[RDS_IB_GID_LEN];        /* zeroed unless the connection is up */
+       __u8            dst_gid[RDS_IB_GID_LEN];        /* zeroed unless the connection is up */
+
+       __u32           max_send_wr;    /* send ring w_nr */
+       __u32           max_recv_wr;    /* receive ring w_nr */
+       __u32           max_send_sge;   /* device max_sge */
+       __u32           rdma_mr_max;    /* max_items of the device's mr_1m_pool */
+       __u32           rdma_mr_size;   /* fmr_attr.max_pages of the device's mr_1m_pool */
+};
+
 /* RDS message Receive Path Latency points */
 enum rds_message_rxpath_latency {
        RDS_MSG_RX_HDR_TO_DGRAM_START = 0,
index 5c9ceed55dae1298bf68d3ec326d647d5869fe71..051e35c1e7c65a76eea0e57220f106b645cd1c24 100644 (file)
@@ -498,16 +498,19 @@ EXPORT_SYMBOL_GPL(rds_conn_destroy);
 
 static void __rds_inc_msg_cp(struct rds_incoming *inc,
                             struct rds_info_iterator *iter,
-                            void *saddr, void *daddr, int flip)
+                            void *saddr, void *daddr, int flip, bool isv6)
 {
-       rds_inc_info_copy(inc, iter, *(__be32 *)saddr,
-                         *(__be32 *)daddr, flip);
+       if (isv6)
+               rds6_inc_info_copy(inc, iter, saddr, daddr, flip);
+       else
+               rds_inc_info_copy(inc, iter, *(__be32 *)saddr,
+                                 *(__be32 *)daddr, flip);
 }
 
 static void rds_conn_message_info_cmn(struct socket *sock, unsigned int len,
                                      struct rds_info_iterator *iter,
                                      struct rds_info_lengths *lens,
-                                     int want_send)
+                                     int want_send, bool isv6)
 {
        struct hlist_head *head;
        struct list_head *list;
@@ -518,7 +521,10 @@ static void rds_conn_message_info_cmn(struct socket *sock, unsigned int len,
        size_t i;
        int j;
 
-       len /= sizeof(struct rds_info_message);
+       if (isv6)
+               len /= sizeof(struct rds6_info_message);
+       else
+               len /= sizeof(struct rds_info_message);
 
        rcu_read_lock();
 
@@ -528,6 +534,9 @@ static void rds_conn_message_info_cmn(struct socket *sock, unsigned int len,
                        struct rds_conn_path *cp;
                        int npaths;
 
+                       if (!isv6 && conn->c_isv6)
+                               continue;
+
                        npaths = (conn->c_trans->t_mp_capable ?
                                 RDS_MPATH_WORKERS : 1);
 
@@ -548,7 +557,7 @@ static void rds_conn_message_info_cmn(struct socket *sock, unsigned int len,
                                                                 iter,
                                                                 &conn->c_laddr,
                                                                 &conn->c_faddr,
-                                                                0);
+                                                                0, isv6);
                                }
 
                                spin_unlock_irqrestore(&cp->cp_lock, flags);
@@ -558,7 +567,10 @@ static void rds_conn_message_info_cmn(struct socket *sock, unsigned int len,
        rcu_read_unlock();
 
        lens->nr = total;
-       lens->each = sizeof(struct rds_info_message);
+       if (isv6)
+               lens->each = sizeof(struct rds6_info_message);
+       else
+               lens->each = sizeof(struct rds_info_message);
 }
 
 static void rds_conn_message_info(struct socket *sock, unsigned int len,
@@ -566,7 +578,15 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len,
                                  struct rds_info_lengths *lens,
                                  int want_send)
 {
-       rds_conn_message_info_cmn(sock, len, iter, lens, want_send);
+       rds_conn_message_info_cmn(sock, len, iter, lens, want_send, false);
+}
+
+/* IPv6 flavour of rds_conn_message_info(): runs the common message walk
+ * with isv6 set, so records are produced as struct rds6_info_message.
+ */
+static void rds6_conn_message_info(struct socket *sock, unsigned int len,
+                                  struct rds_info_iterator *iter,
+                                  struct rds_info_lengths *lens,
+                                  int want_send)
+{
+       const bool isv6 = true;
+
+       rds_conn_message_info_cmn(sock, len, iter, lens, want_send, isv6);
 }
 
 static void rds_conn_message_info_send(struct socket *sock, unsigned int len,
@@ -576,6 +596,13 @@ static void rds_conn_message_info_send(struct socket *sock, unsigned int len,
        rds_conn_message_info(sock, len, iter, lens, 1);
 }
 
+/* Info handler registered for RDS6_INFO_SEND_MESSAGES: the IPv6 message
+ * walk with want_send set to 1.
+ */
+static void rds6_conn_message_info_send(struct socket *sock, unsigned int len,
+                                       struct rds_info_iterator *iter,
+                                       struct rds_info_lengths *lens)
+{
+       const int want_send = 1;
+
+       rds6_conn_message_info(sock, len, iter, lens, want_send);
+}
+
 static void rds_conn_message_info_retrans(struct socket *sock,
                                          unsigned int len,
                                          struct rds_info_iterator *iter,
@@ -584,6 +611,14 @@ static void rds_conn_message_info_retrans(struct socket *sock,
        rds_conn_message_info(sock, len, iter, lens, 0);
 }
 
+/* Info handler registered for RDS6_INFO_RETRANS_MESSAGES: the IPv6 message
+ * walk with want_send set to 0.
+ */
+static void rds6_conn_message_info_retrans(struct socket *sock,
+                                          unsigned int len,
+                                          struct rds_info_iterator *iter,
+                                          struct rds_info_lengths *lens)
+{
+       const int want_send = 0;
+
+       rds6_conn_message_info(sock, len, iter, lens, want_send);
+}
+
 void rds_for_each_conn_info(struct socket *sock, unsigned int len,
                          struct rds_info_iterator *iter,
                          struct rds_info_lengths *lens,
@@ -699,6 +734,34 @@ static int rds_conn_info_visitor(struct rds_conn_path *cp, void *buffer)
        return 1;
 }
 
+/* Describe connection path @cp as one struct rds6_info_connection.
+ *
+ * @cp:     connection path to report
+ * @buffer: storage for one struct rds6_info_connection
+ *
+ * Always returns 1: there is no error case, but rds_walk_conn_path_info()
+ * expects its helper to return a value.
+ */
+static int rds6_conn_info_visitor(struct rds_conn_path *cp, void *buffer)
+{
+       struct rds_connection *conn = cp->cp_conn;
+       struct rds6_info_connection *info = buffer;
+
+       info->flags = 0;
+       info->next_tx_seq = cp->cp_next_tx_seq;
+       info->next_rx_seq = cp->cp_next_rx_seq;
+       info->laddr = conn->c_laddr;
+       info->faddr = conn->c_faddr;
+       strncpy(info->transport, conn->c_trans->t_name,
+               sizeof(info->transport));
+
+       rds_conn_info_set(info->flags,
+                         test_bit(RDS_IN_XMIT, &cp->cp_flags),
+                         SENDING);
+       /* XXX Future: return the state rather than these funky bits */
+       rds_conn_info_set(info->flags,
+                         atomic_read(&cp->cp_state) == RDS_CONN_CONNECTING,
+                         CONNECTING);
+       rds_conn_info_set(info->flags,
+                         atomic_read(&cp->cp_state) == RDS_CONN_UP,
+                         CONNECTED);
+
+       return 1;
+}
+
 static void rds_conn_info(struct socket *sock, unsigned int len,
                          struct rds_info_iterator *iter,
                          struct rds_info_lengths *lens)
@@ -711,6 +774,18 @@ static void rds_conn_info(struct socket *sock, unsigned int len,
                                sizeof(struct rds_info_connection));
 }
 
+/* Info handler registered for RDS6_INFO_CONNECTIONS: walks the connection
+ * paths with rds6_conn_info_visitor(), one struct rds6_info_connection each.
+ */
+static void rds6_conn_info(struct socket *sock, unsigned int len,
+                          struct rds_info_iterator *iter,
+                          struct rds_info_lengths *lens)
+{
+       /* Scratch space for one record, rounded up to a whole number of u64s. */
+       u64 scratch[(sizeof(struct rds6_info_connection) + sizeof(u64) - 1) /
+                   sizeof(u64)];
+
+       rds_walk_conn_path_info(sock, len, iter, lens, rds6_conn_info_visitor,
+                               scratch, sizeof(struct rds6_info_connection));
+}
+
 int rds_conn_init(void)
 {
        int ret;
@@ -732,6 +807,11 @@ int rds_conn_init(void)
                               rds_conn_message_info_send);
        rds_info_register_func(RDS_INFO_RETRANS_MESSAGES,
                               rds_conn_message_info_retrans);
+       rds_info_register_func(RDS6_INFO_CONNECTIONS, rds6_conn_info);
+       rds_info_register_func(RDS6_INFO_SEND_MESSAGES,
+                              rds6_conn_message_info_send);
+       rds_info_register_func(RDS6_INFO_RETRANS_MESSAGES,
+                              rds6_conn_message_info_retrans);
 
        return 0;
 }
@@ -750,6 +830,11 @@ void rds_conn_exit(void)
                                 rds_conn_message_info_send);
        rds_info_deregister_func(RDS_INFO_RETRANS_MESSAGES,
                                 rds_conn_message_info_retrans);
+       rds_info_deregister_func(RDS6_INFO_CONNECTIONS, rds6_conn_info);
+       rds_info_deregister_func(RDS6_INFO_SEND_MESSAGES,
+                                rds6_conn_message_info_send);
+       rds_info_deregister_func(RDS6_INFO_RETRANS_MESSAGES,
+                                rds6_conn_message_info_retrans);
 }
 
 /*
index 756225c5540faf2bcb93cbbbedd049bde75dae43..63d95ea7cdff9e5f6677b95ba7f2e4d9723b4bc7 100644 (file)
@@ -321,6 +321,43 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn,
        return 1;
 }
 
+/* IPv6 version of rds_ib_conn_info_visitor().
+ *
+ * @conn:   connection to describe
+ * @buffer: storage for one struct rds6_info_rdma_connection
+ *
+ * Returns 0, leaving @buffer untouched, when @conn does not use the IB
+ * transport; otherwise fills @buffer and returns 1.
+ */
+static int rds6_ib_conn_info_visitor(struct rds_connection *conn,
+                                    void *buffer)
+{
+       struct rds6_info_rdma_connection *iinfo6 = buffer;
+       struct rds_ib_connection *ic;
+
+       /* We will only ever look at IB transports */
+       if (conn->c_trans != &rds_ib_transport)
+               return 0;
+
+       /* The GIDs and the work-request/SGE/MR limits are only filled in
+        * for a connection that is up. Clear the whole record first so a
+        * connection in any other state reports zeros rather than whatever
+        * @buffer held before this call.
+        */
+       memset(iinfo6, 0, sizeof(*iinfo6));
+
+       iinfo6->src_addr = conn->c_laddr;
+       iinfo6->dst_addr = conn->c_faddr;
+
+       if (rds_conn_state(conn) == RDS_CONN_UP) {
+               struct rds_ib_device *rds_ibdev;
+               struct rdma_dev_addr *dev_addr;
+
+               ic = conn->c_transport_data;
+               dev_addr = &ic->i_cm_id->route.addr.dev_addr;
+               rdma_addr_get_sgid(dev_addr,
+                                  (union ib_gid *)&iinfo6->src_gid);
+               rdma_addr_get_dgid(dev_addr,
+                                  (union ib_gid *)&iinfo6->dst_gid);
+
+               rds_ibdev = ic->rds_ibdev;
+               iinfo6->max_send_wr = ic->i_send_ring.w_nr;
+               iinfo6->max_recv_wr = ic->i_recv_ring.w_nr;
+               iinfo6->max_send_sge = rds_ibdev->max_sge;
+               rds6_ib_get_mr_info(rds_ibdev, iinfo6);
+       }
+       return 1;
+}
+
 static void rds_ib_ic_info(struct socket *sock, unsigned int len,
                           struct rds_info_iterator *iter,
                           struct rds_info_lengths *lens)
@@ -333,6 +370,19 @@ static void rds_ib_ic_info(struct socket *sock, unsigned int len,
                                sizeof(struct rds_info_rdma_connection));
 }
 
+/* IPv6 version of rds_ib_ic_info(). Registered as the handler for
+ * RDS6_INFO_IB_CONNECTIONS; visits the connections with
+ * rds6_ib_conn_info_visitor(), one struct rds6_info_rdma_connection each.
+ */
+static void rds6_ib_ic_info(struct socket *sock, unsigned int len,
+                           struct rds_info_iterator *iter,
+                           struct rds_info_lengths *lens)
+{
+       /* Scratch space for one record, rounded up to a whole number of u64s. */
+       u64 scratch[(sizeof(struct rds6_info_rdma_connection) +
+                    sizeof(u64) - 1) / sizeof(u64)];
+
+       rds_for_each_conn_info(sock, len, iter, lens, rds6_ib_conn_info_visitor,
+                              scratch,
+                              sizeof(struct rds6_info_rdma_connection));
+}
+
 /*
  * Early RDS/IB was built to only bind to an address if there is an IPoIB
  * device with that address set.
@@ -441,6 +491,7 @@ void rds_ib_exit(void)
        rds_ib_set_unloading();
        synchronize_rcu();
        rds_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
+       rds_info_deregister_func(RDS6_INFO_IB_CONNECTIONS, rds6_ib_ic_info);
        rds_ib_unregister_client();
        rds_ib_destroy_nodev_conns();
        rds_ib_sysctl_exit();
@@ -502,6 +553,7 @@ int rds_ib_init(void)
        rds_trans_register(&rds_ib_transport);
 
        rds_info_register_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
+       rds_info_register_func(RDS6_INFO_IB_CONNECTIONS, rds6_ib_ic_info);
 
        goto out;
 
index 0ea4ab017a8cc3f807931e1194cddb5048a82956..f440ace584c8f363639171bab2d986d1dc24043b 100644 (file)
@@ -113,6 +113,8 @@ struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_dev,
                                             int npages);
 void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev,
                        struct rds_info_rdma_connection *iinfo);
+void rds6_ib_get_mr_info(struct rds_ib_device *rds_ibdev,
+                        struct rds6_info_rdma_connection *iinfo6);
 void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *);
 void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
                    struct rds_sock *rs, u32 *key_ret);
index 0ec9df043dd08f0e4fa231c86061da7ecd72ccd3..e3c8bbbdb43f98f19b639319b66596e1cffb85b0 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -180,6 +180,15 @@ void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_co
        iinfo->rdma_mr_size = pool_1m->fmr_attr.max_pages;
 }
 
+/* IPv6 counterpart of rds_ib_get_mr_info(): copies max_items and
+ * fmr_attr.max_pages of @rds_ibdev's mr_1m_pool into @iinfo6's rdma_mr_max
+ * and rdma_mr_size.
+ */
+void rds6_ib_get_mr_info(struct rds_ib_device *rds_ibdev,
+                        struct rds6_info_rdma_connection *iinfo6)
+{
+       const struct rds_ib_mr_pool *pool = rds_ibdev->mr_1m_pool;
+
+       iinfo6->rdma_mr_size = pool->fmr_attr.max_pages;
+       iinfo6->rdma_mr_max = pool->max_items;
+}
+
 struct rds_ib_mr *rds_ib_reuse_mr(struct rds_ib_mr_pool *pool)
 {
        struct rds_ib_mr *ibmr = NULL;
index 1402c21210b1079a6374acc51b3fe00fba0d0889..03cd8df54c266fb43f186a7052324ab1f30e577a 100644 (file)
@@ -792,3 +792,28 @@ void rds_inc_info_copy(struct rds_incoming *inc,
 
        rds_info_copy(iter, &minfo, sizeof(minfo));
 }
+
+/* Copy a summary of one incoming message to the info iterator, using the
+ * IPv6-capable struct rds6_info_message layout.
+ *
+ * @inc:   incoming message; its header supplies sequence, length and ports
+ * @iter:  destination, written through rds_info_copy()
+ * @saddr: source address of the message
+ * @daddr: destination address of the message
+ * @flip:  when non-zero, the destination address/port is reported as the
+ *         local side and the source as the foreign side; when zero, the
+ *         source is local and the destination is foreign
+ */
+void rds6_inc_info_copy(struct rds_incoming *inc,
+                       struct rds_info_iterator *iter,
+                       struct in6_addr *saddr, struct in6_addr *daddr,
+                       int flip)
+{
+       struct rds6_info_message minfo6;
+
+       minfo6.seq = be64_to_cpu(inc->i_hdr.h_sequence);
+       minfo6.len = be32_to_cpu(inc->i_hdr.h_len);
+
+       if (flip) {
+               minfo6.laddr = *daddr;
+               minfo6.faddr = *saddr;
+               minfo6.lport = inc->i_hdr.h_dport;
+               minfo6.fport = inc->i_hdr.h_sport;
+       } else {
+               minfo6.laddr = *saddr;
+               minfo6.faddr = *daddr;
+               minfo6.lport = inc->i_hdr.h_sport;
+               minfo6.fport = inc->i_hdr.h_dport;
+       }
+
+       /* Nothing above provides values for these two members. The whole
+        * on-stack record is copied out below, so give them a defined value
+        * instead of exposing stale stack bytes. The struct is packed, so
+        * with these set every byte of the record is initialised.
+        */
+       minfo6.flags = 0;
+       minfo6.tos = 0;
+
+       rds_info_copy(iter, &minfo6, sizeof(minfo6));
+}
index 890d0e1d8908cdcb4444165e2f609fb6a1698bec..7028d6e51947149324bf2976ac75ed5ae2fb0d06 100644 (file)
@@ -273,6 +273,48 @@ out:
        spin_unlock_irqrestore(&rds_tcp_tc_list_lock, flags);
 }
 
+/* Handler for the RDS6_INFO_TCP_SOCKETS socket option. Both IPv4 and IPv6
+ * connections are reported; an IPv4 connection's addresses are returned as
+ * IPv4 mapped addresses.
+ *
+ * Records are copied out only when @len has room for rds6_tcp_tc_count of
+ * them. @lens always receives that count and the per-record size.
+ */
+static void rds6_tcp_tc_info(struct socket *sock, unsigned int len,
+                            struct rds_info_iterator *iter,
+                            struct rds_info_lengths *lens)
+{
+       struct rds_tcp_connection *tc;
+       struct rds6_info_tcp_socket rec;
+       unsigned long flags;
+
+       spin_lock_irqsave(&rds_tcp_tc_list_lock, flags);
+
+       if (len / sizeof(rec) >= rds6_tcp_tc_count) {
+               list_for_each_entry(tc, &rds_tcp_tc_list, t_list_item) {
+                       struct sock *sk = tc->t_sock->sk;
+                       struct inet_sock *inet = inet_sk(sk);
+
+                       rec.local_addr = sk->sk_v6_rcv_saddr;
+                       rec.peer_addr = sk->sk_v6_daddr;
+                       rec.local_port = inet->inet_sport;
+                       rec.peer_port = inet->inet_dport;
+
+                       rec.hdr_rem = tc->t_tinc_hdr_rem;
+                       rec.data_rem = tc->t_tinc_data_rem;
+                       rec.last_sent_nxt = tc->t_last_sent_nxt;
+                       rec.last_expected_una = tc->t_last_expected_una;
+                       rec.last_seen_una = tc->t_last_seen_una;
+
+                       rds_info_copy(iter, &rec, sizeof(rec));
+               }
+       }
+
+       /* Filled in before the list lock is dropped, on both paths. */
+       lens->nr = rds6_tcp_tc_count;
+       lens->each = sizeof(rec);
+
+       spin_unlock_irqrestore(&rds_tcp_tc_list_lock, flags);
+}
+
 static int rds_tcp_laddr_check(struct net *net, const struct in6_addr *addr,
                               __u32 scope_id)
 {
@@ -628,6 +670,7 @@ static void rds_tcp_exit(void)
        rds_tcp_set_unloading();
        synchronize_rcu();
        rds_info_deregister_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info);
+       rds_info_deregister_func(RDS6_INFO_TCP_SOCKETS, rds6_tcp_tc_info);
        unregister_pernet_device(&rds_tcp_net_ops);
        rds_tcp_destroy_conns();
        rds_trans_unregister(&rds_tcp_transport);
@@ -659,6 +702,7 @@ static int rds_tcp_init(void)
        rds_trans_register(&rds_tcp_transport);
 
        rds_info_register_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info);
+       rds_info_register_func(RDS6_INFO_TCP_SOCKETS, rds6_tcp_tc_info);
 
        goto out;
 out_recv: