rds: add type of service(tos) infrastructure
author Santosh Shilimkar <santosh.shilimkar@oracle.com>
Wed, 24 Oct 2018 03:21:14 +0000 (23:21 -0400)
committer Santosh Shilimkar <santosh.shilimkar@oracle.com>
Mon, 4 Feb 2019 22:59:12 +0000 (14:59 -0800)
RDS Service type (TOS) is user-defined and needs to be configured
via the RDS IOCTL interface. It must be set before initiating any
traffic and, once set, the TOS cannot be changed. All outgoing
traffic from the socket will be associated with its TOS.

Reviewed-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
[yanjun.zhu@oracle.com: Adapted original patch with ipv6 changes]
Signed-off-by: Zhu Yanjun <yanjun.zhu@oracle.com>
include/uapi/linux/rds.h
net/rds/af_rds.c
net/rds/connection.c
net/rds/ib.c
net/rds/ib_cm.c
net/rds/rdma_transport.c
net/rds/rds.h
net/rds/recv.c
net/rds/send.c
net/rds/tcp.c
net/rds/tcp_listen.c

index 8b73cb603c5f32c78ff4ac78df4e78453a449cd9..5d0f76c780e5f33c9e53e454aa445185f5df6348 100644 (file)
 #define RDS_TRANS_COUNT        3
 #define        RDS_TRANS_NONE  (~0)
 
+/* IOCTLS commands for SOL_RDS */
+#define SIOCRDSSETTOS          (SIOCPROTOPRIVATE)
+#define SIOCRDSGETTOS          (SIOCPROTOPRIVATE + 1)
+
+typedef __u8   rds_tos_t;
+
 /*
  * Control message types for SOL_RDS.
  *
@@ -149,6 +155,7 @@ struct rds_info_connection {
        __be32          faddr;
        __u8            transport[TRANSNAMSIZ];         /* null term ascii */
        __u8            flags;
+       __u8            tos;
 } __attribute__((packed));
 
 struct rds6_info_connection {
@@ -171,6 +178,7 @@ struct rds_info_message {
        __be16          lport;
        __be16          fport;
        __u8            flags;
+       __u8            tos;
 } __attribute__((packed));
 
 struct rds6_info_message {
@@ -214,6 +222,7 @@ struct rds_info_tcp_socket {
        __u32           last_sent_nxt;
        __u32           last_expected_una;
        __u32           last_seen_una;
+       __u8            tos;
 } __attribute__((packed));
 
 struct rds6_info_tcp_socket {
@@ -240,6 +249,7 @@ struct rds_info_rdma_connection {
        __u32           max_send_sge;
        __u32           rdma_mr_max;
        __u32           rdma_mr_size;
+       __u8            tos;
 };
 
 struct rds6_info_rdma_connection {
@@ -253,6 +263,7 @@ struct rds6_info_rdma_connection {
        __u32           max_send_sge;
        __u32           rdma_mr_max;
        __u32           rdma_mr_size;
+       __u8            tos;
 };
 
 /* RDS message Receive Path Latency points */
index 65571a6273c320b60bef16c8921fa80fe389648f..9045158580370e0969c29b57cc3963982c769c62 100644 (file)
@@ -254,7 +254,38 @@ static __poll_t rds_poll(struct file *file, struct socket *sock,
 
 static int rds_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 {
-       return -ENOIOCTLCMD;
+       struct rds_sock *rs = rds_sk_to_rs(sock->sk);
+       rds_tos_t tos;
+
+       switch (cmd) {
+       case SIOCRDSSETTOS:
+               if (get_user(tos, (rds_tos_t __user *)arg))
+                       return -EFAULT;
+
+               if (rs->rs_transport &&
+                   rs->rs_transport->t_type == RDS_TRANS_TCP)
+                       tos = 0;
+
+               spin_lock_bh(&rds_sock_lock);
+               if (rs->rs_tos || rs->rs_conn) {
+                       spin_unlock_bh(&rds_sock_lock);
+                       return -EINVAL;
+               }
+               rs->rs_tos = tos;
+               spin_unlock_bh(&rds_sock_lock);
+               break;
+       case SIOCRDSGETTOS:
+               spin_lock_bh(&rds_sock_lock);
+               tos = rs->rs_tos;
+               spin_unlock_bh(&rds_sock_lock);
+               if (put_user(tos, (rds_tos_t __user *)arg))
+                       return -EFAULT;
+               break;
+       default:
+               return -ENOIOCTLCMD;
+       }
+
+       return 0;
 }
 
 static int rds_cancel_sent_to(struct rds_sock *rs, char __user *optval,
@@ -650,6 +681,8 @@ static int __rds_create(struct socket *sock, struct sock *sk, int protocol)
        spin_lock_init(&rs->rs_rdma_lock);
        rs->rs_rdma_keys = RB_ROOT;
        rs->rs_rx_traces = 0;
+       rs->rs_tos = 0;
+       rs->rs_conn = NULL;
 
        spin_lock_bh(&rds_sock_lock);
        list_add_tail(&rs->rs_item, &rds_sock_list);
index 1ab14b68ecc8ccfb63ef32eee7cffe8051f3db01..7ea134f9a825eb9f136be14190c91218719db455 100644 (file)
@@ -84,7 +84,7 @@ static struct rds_connection *rds_conn_lookup(struct net *net,
                                              const struct in6_addr *laddr,
                                              const struct in6_addr *faddr,
                                              struct rds_transport *trans,
-                                             int dev_if)
+                                             u8 tos, int dev_if)
 {
        struct rds_connection *conn, *ret = NULL;
 
@@ -92,6 +92,7 @@ static struct rds_connection *rds_conn_lookup(struct net *net,
                if (ipv6_addr_equal(&conn->c_faddr, faddr) &&
                    ipv6_addr_equal(&conn->c_laddr, laddr) &&
                    conn->c_trans == trans &&
+                   conn->c_tos == tos &&
                    net == rds_conn_net(conn) &&
                    conn->c_dev_if == dev_if) {
                        ret = conn;
@@ -160,7 +161,7 @@ static struct rds_connection *__rds_conn_create(struct net *net,
                                                const struct in6_addr *laddr,
                                                const struct in6_addr *faddr,
                                                struct rds_transport *trans,
-                                               gfp_t gfp,
+                                               gfp_t gfp, u8 tos,
                                                int is_outgoing,
                                                int dev_if)
 {
@@ -172,7 +173,7 @@ static struct rds_connection *__rds_conn_create(struct net *net,
        int npaths = (trans->t_mp_capable ? RDS_MPATH_WORKERS : 1);
 
        rcu_read_lock();
-       conn = rds_conn_lookup(net, head, laddr, faddr, trans, dev_if);
+       conn = rds_conn_lookup(net, head, laddr, faddr, trans, tos, dev_if);
        if (conn &&
            conn->c_loopback &&
            conn->c_trans != &rds_loop_transport &&
@@ -206,6 +207,7 @@ static struct rds_connection *__rds_conn_create(struct net *net,
        conn->c_isv6 = !ipv6_addr_v4mapped(laddr);
        conn->c_faddr = *faddr;
        conn->c_dev_if = dev_if;
+       conn->c_tos = tos;
 
 #if IS_ENABLED(CONFIG_IPV6)
        /* If the local address is link local, set c_bound_if to be the
@@ -298,7 +300,7 @@ static struct rds_connection *__rds_conn_create(struct net *net,
                struct rds_connection *found;
 
                found = rds_conn_lookup(net, head, laddr, faddr, trans,
-                                       dev_if);
+                                       tos, dev_if);
                if (found) {
                        struct rds_conn_path *cp;
                        int i;
@@ -333,10 +335,10 @@ out:
 struct rds_connection *rds_conn_create(struct net *net,
                                       const struct in6_addr *laddr,
                                       const struct in6_addr *faddr,
-                                      struct rds_transport *trans, gfp_t gfp,
-                                      int dev_if)
+                                      struct rds_transport *trans, u8 tos,
+                                      gfp_t gfp, int dev_if)
 {
-       return __rds_conn_create(net, laddr, faddr, trans, gfp, 0, dev_if);
+       return __rds_conn_create(net, laddr, faddr, trans, gfp, tos, 0, dev_if);
 }
 EXPORT_SYMBOL_GPL(rds_conn_create);
 
@@ -344,9 +346,9 @@ struct rds_connection *rds_conn_create_outgoing(struct net *net,
                                                const struct in6_addr *laddr,
                                                const struct in6_addr *faddr,
                                                struct rds_transport *trans,
-                                               gfp_t gfp, int dev_if)
+                                               u8 tos, gfp_t gfp, int dev_if)
 {
-       return __rds_conn_create(net, laddr, faddr, trans, gfp, 1, dev_if);
+       return __rds_conn_create(net, laddr, faddr, trans, gfp, tos, 1, dev_if);
 }
 EXPORT_SYMBOL_GPL(rds_conn_create_outgoing);
 
index 9d7b7586f240669d4c4a5372f618e424f0ce18dd..21b6588b71caa04d0ad1a5c6035ac9c28fb70973 100644 (file)
@@ -301,6 +301,7 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn,
 
        iinfo->src_addr = conn->c_laddr.s6_addr32[3];
        iinfo->dst_addr = conn->c_faddr.s6_addr32[3];
+       iinfo->tos = conn->c_tos;
 
        memset(&iinfo->src_gid, 0, sizeof(iinfo->src_gid));
        memset(&iinfo->dst_gid, 0, sizeof(iinfo->dst_gid));
index a1c3ad380ec81fb43b9900dc4dbd9fff7e37cad1..70518e329a9e583d275323b81010c0d245a2e4f6 100644 (file)
@@ -786,7 +786,7 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
 
        /* RDS/IB is not currently netns aware, thus init_net */
        conn = rds_conn_create(&init_net, daddr6, saddr6,
-                              &rds_ib_transport, GFP_KERNEL, ifindex);
+                              &rds_ib_transport, 0, GFP_KERNEL, ifindex);
        if (IS_ERR(conn)) {
                rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn));
                conn = NULL;
index 63cbc6b8560ca1efcc9e639c745f0f6761197fb2..e37f91537d297c1c5ba7425a11888fab7f827e37 100644 (file)
@@ -115,6 +115,7 @@ static int rds_rdma_cm_event_handler_cmn(struct rdma_cm_id *cm_id,
                        pr_warn("RDS/RDMA: conn <%pI6c, %pI6c> rejected, dropping connection\n",
                                &conn->c_laddr, &conn->c_faddr);
                        conn->c_proposed_version = RDS_PROTOCOL_COMPAT_VERSION;
+                       conn->c_tos = 0;
                        rds_conn_drop(conn);
                }
                rdsdebug("Connection rejected: %s\n",
index 660023f08553b5eb2f29631c8451b0532f25d45e..7e52b92092d718e5e3742bd6d5c0e70ddd42d77c 100644 (file)
@@ -158,6 +158,9 @@ struct rds_connection {
        unsigned int            c_version;
        possible_net_t          c_net;
 
+       /* TOS */
+       u8                      c_tos;
+
        struct list_head        c_map_item;
        unsigned long           c_map_queued;
 
@@ -652,6 +655,7 @@ struct rds_sock {
        u8                      rs_rx_traces;
        u8                      rs_rx_trace[RDS_MSG_RX_DGRAM_TRACE_MAX];
        struct rds_msg_zcopy_queue rs_zcookie_queue;
+       u8                      rs_tos;
 };
 
 static inline struct rds_sock *rds_sk_to_rs(const struct sock *sk)
@@ -760,13 +764,14 @@ void rds_conn_exit(void);
 struct rds_connection *rds_conn_create(struct net *net,
                                       const struct in6_addr *laddr,
                                       const struct in6_addr *faddr,
-                                      struct rds_transport *trans, gfp_t gfp,
+                                      struct rds_transport *trans,
+                                      u8 tos, gfp_t gfp,
                                       int dev_if);
 struct rds_connection *rds_conn_create_outgoing(struct net *net,
                                                const struct in6_addr *laddr,
                                                const struct in6_addr *faddr,
                                                struct rds_transport *trans,
-                                               gfp_t gfp, int dev_if);
+                                               u8 tos, gfp_t gfp, int dev_if);
 void rds_conn_shutdown(struct rds_conn_path *cpath);
 void rds_conn_destroy(struct rds_connection *conn);
 void rds_conn_drop(struct rds_connection *conn);
index 6bb6b16ca270aaa89975e5fcf639edd7b2ee82f8..853de48760880603bf60142700cd668100ca289e 100644 (file)
@@ -782,6 +782,7 @@ void rds_inc_info_copy(struct rds_incoming *inc,
 
        minfo.seq = be64_to_cpu(inc->i_hdr.h_sequence);
        minfo.len = be32_to_cpu(inc->i_hdr.h_len);
+       minfo.tos = inc->i_conn->c_tos;
 
        if (flip) {
                minfo.laddr = daddr;
index fd8b687d5c05ed665b6b8e770ba398af85b8a2c8..c555e121b908bad328263abba739e80a096e5229 100644 (file)
@@ -1277,12 +1277,12 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
 
        /* rds_conn_create has a spinlock that runs with IRQ off.
         * Caching the conn in the socket helps a lot. */
-       if (rs->rs_conn && ipv6_addr_equal(&rs->rs_conn->c_faddr, &daddr))
+       if (rs->rs_conn && ipv6_addr_equal(&rs->rs_conn->c_faddr, &daddr)) {
                conn = rs->rs_conn;
-       else {
+       } else {
                conn = rds_conn_create_outgoing(sock_net(sock->sk),
                                                &rs->rs_bound_addr, &daddr,
-                                               rs->rs_transport,
+                                               rs->rs_transport, 0,
                                                sock->sk->sk_allocation,
                                                scope_id);
                if (IS_ERR(conn)) {
index c16f0a362c32c302cf71d810b854ba335603848b..eb6851952cbf2084d290c40a50a0760c9f0d5471 100644 (file)
@@ -267,6 +267,7 @@ static void rds_tcp_tc_info(struct socket *rds_sock, unsigned int len,
                tsinfo.last_sent_nxt = tc->t_last_sent_nxt;
                tsinfo.last_expected_una = tc->t_last_expected_una;
                tsinfo.last_seen_una = tc->t_last_seen_una;
+               tsinfo.tos = tc->t_cpath->cp_conn->c_tos;
 
                rds_info_copy(iter, &tsinfo, sizeof(tsinfo));
        }
index c12203f646da92e439dc17a35c6c31be8a56c4c4..810a3a49e9474ed643538e41c1d6cb0c21fca396 100644 (file)
@@ -200,7 +200,7 @@ int rds_tcp_accept_one(struct socket *sock)
 
        conn = rds_conn_create(sock_net(sock->sk),
                               my_addr, peer_addr,
-                              &rds_tcp_transport, GFP_KERNEL, dev_if);
+                              &rds_tcp_transport, 0, GFP_KERNEL, dev_if);
 
        if (IS_ERR(conn)) {
                ret = PTR_ERR(conn);