mptcp: Handle MP_CAPABLE options for outgoing connections
authorPeter Krystad <peter.krystad@linux.intel.com>
Wed, 22 Jan 2020 00:56:18 +0000 (16:56 -0800)
committerDavid S. Miller <davem@davemloft.net>
Fri, 24 Jan 2020 12:44:07 +0000 (13:44 +0100)
Add hooks to tcp_output.c to add MP_CAPABLE to an outgoing SYN request,
to capture the MP_CAPABLE in the received SYN-ACK, and to add MP_CAPABLE
to the final ACK of the three-way handshake.

Use the .sk_rx_dst_set() handler in the subflow proto to capture when the
responding SYN-ACK is received and notify the MPTCP connection layer.

Co-developed-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Co-developed-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Peter Krystad <peter.krystad@linux.intel.com>
Signed-off-by: Christoph Paasch <cpaasch@apple.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
include/linux/tcp.h
include/net/mptcp.h
net/ipv4/tcp_input.c
net/ipv4/tcp_output.c
net/ipv6/tcp_ipv6.c
net/mptcp/options.c
net/mptcp/protocol.c
net/mptcp/protocol.h
net/mptcp/subflow.c

index 87794747581410c4c615c1fb31bb40e5140b8750..e9ee06d887fa1dd4606c77e8d163eb297ad5971c 100644 (file)
@@ -137,6 +137,9 @@ struct tcp_request_sock {
        const struct tcp_request_sock_ops *af_specific;
        u64                             snt_synack; /* first SYNACK sent time */
        bool                            tfo_listener;
+#if IS_ENABLED(CONFIG_MPTCP)
+       bool                            is_mptcp;
+#endif
        u32                             txhash;
        u32                             rcv_isn;
        u32                             snt_isn;
index 3daec2ceb3ff0b154fa22d3ef7cfa93c32931654..eabc57c3fde45bcd27377d253c25b3e09f05ad63 100644 (file)
@@ -39,8 +39,27 @@ struct mptcp_out_options {
 
 void mptcp_init(void);
 
+/* Return the is_mptcp flag kept in the tcp_sock behind @sk. */
+static inline bool sk_is_mptcp(const struct sock *sk)
+{
+       return tcp_sk(sk)->is_mptcp;
+}
+
+/* Return the is_mptcp flag kept in the tcp_request_sock behind @req. */
+static inline bool rsk_is_mptcp(const struct request_sock *req)
+{
+       return tcp_rsk(req)->is_mptcp;
+}
+
 void mptcp_parse_option(const unsigned char *ptr, int opsize,
                        struct tcp_options_received *opt_rx);
+bool mptcp_syn_options(struct sock *sk, unsigned int *size,
+                      struct mptcp_out_options *opts);
+void mptcp_rcv_synsent(struct sock *sk);
+bool mptcp_synack_options(const struct request_sock *req, unsigned int *size,
+                         struct mptcp_out_options *opts);
+bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
+                              unsigned int *size, unsigned int remaining,
+                              struct mptcp_out_options *opts);
+
 void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts);
 
 /* move the skb extension ownership, with the assumption that 'to' is
@@ -89,11 +108,47 @@ static inline void mptcp_init(void)
 {
 }
 
+/* Fallback variant: no socket is ever reported as MPTCP.
+ * NOTE(review): presumably the branch used when CONFIG_MPTCP is off; the
+ * guarding preprocessor line is outside this hunk - confirm.
+ */
+static inline bool sk_is_mptcp(const struct sock *sk)
+{
+       return false;
+}
+
+/* Fallback variant: no request socket is ever reported as MPTCP
+ * (presumably the build without CONFIG_MPTCP - confirm).
+ */
+static inline bool rsk_is_mptcp(const struct request_sock *req)
+{
+       return false;
+}
+
 static inline void mptcp_parse_option(const unsigned char *ptr, int opsize,
                                      struct tcp_options_received *opt_rx)
 {
 }
 
+/* Fallback variant: never contributes an option to a SYN; @size and @opts
+ * are left untouched (presumably the build without CONFIG_MPTCP - confirm).
+ */
+static inline bool mptcp_syn_options(struct sock *sk, unsigned int *size,
+                                    struct mptcp_out_options *opts)
+{
+       return false;
+}
+
+/* Fallback variant: nothing to record when the SYN-ACK arrives
+ * (presumably the build without CONFIG_MPTCP - confirm).
+ */
+static inline void mptcp_rcv_synsent(struct sock *sk)
+{
+}
+
+/* Fallback variant: never contributes an option to a SYN-ACK; @size and
+ * @opts are left untouched (presumably the build without CONFIG_MPTCP -
+ * confirm).
+ */
+static inline bool mptcp_synack_options(const struct request_sock *req,
+                                       unsigned int *size,
+                                       struct mptcp_out_options *opts)
+{
+       return false;
+}
+
+/* Fallback variant: never contributes an option to an established-state
+ * segment; @size and @opts are left untouched (presumably the build
+ * without CONFIG_MPTCP - confirm).
+ */
+static inline bool mptcp_established_options(struct sock *sk,
+                                            struct sk_buff *skb,
+                                            unsigned int *size,
+                                            unsigned int remaining,
+                                            struct mptcp_out_options *opts)
+{
+       return false;
+}
+
 static inline void mptcp_skb_ext_move(struct sk_buff *to,
                                      const struct sk_buff *from)
 {
@@ -107,6 +162,8 @@ static inline bool mptcp_skb_can_collapse(const struct sk_buff *to,
 
 #endif /* CONFIG_MPTCP */
 
+void mptcp_handle_ipv6_mapped(struct sock *sk, bool mapped);
+
 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
 int mptcpv6_init(void);
 #elif IS_ENABLED(CONFIG_IPV6)
index 3458ee13e6f056d385858b8a2b8792ea0d915fe3..5165c8de47ee4e9e589ab9f0d156b5620f1d93fa 100644 (file)
@@ -5978,6 +5978,9 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
                tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
                tcp_initialize_rcv_mss(sk);
 
+               if (sk_is_mptcp(sk))
+                       mptcp_rcv_synsent(sk);
+
                /* Remember, tcp_poll() does not lock socket!
                 * Change state from SYN-SENT only after copied_seq
                 * is initialized. */
@@ -6600,6 +6603,9 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 
        tcp_rsk(req)->af_specific = af_ops;
        tcp_rsk(req)->ts_off = 0;
+#if IS_ENABLED(CONFIG_MPTCP)
+       tcp_rsk(req)->is_mptcp = 0;
+#endif
 
        tcp_clear_options(&tmp_opt);
        tmp_opt.mss_clamp = af_ops->mss_clamp;
index 0f0984f39f67f502e65e6e50afb5913b0932cf52..5456076166dac3bfbc583e2ceeb2cfbdc2afa646 100644 (file)
@@ -597,6 +597,22 @@ static void smc_set_option_cond(const struct tcp_sock *tp,
 #endif
 }
 
+/* Reserve room for the MPTCP SYN-ACK option, if one applies and fits.
+ *
+ * OPTION_MPTCP is flagged in @opts and *@remaining is reduced only when
+ * @req is an MPTCP request, mptcp_synack_options() produced an option,
+ * and that option fits in the space still available.
+ */
+static void mptcp_set_option_cond(const struct request_sock *req,
+                                 struct tcp_out_options *opts,
+                                 unsigned int *remaining)
+{
+       unsigned int size;
+
+       if (!rsk_is_mptcp(req))
+               return;
+
+       if (!mptcp_synack_options(req, &size, &opts->mptcp))
+               return;
+
+       /* not enough option space left: send the SYN-ACK without it */
+       if (*remaining < size)
+               return;
+
+       opts->options |= OPTION_MPTCP;
+       *remaining -= size;
+}
+
 /* Compute TCP options for SYN packets. This is not the final
  * network wire format yet.
  */
@@ -666,6 +682,15 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
 
        smc_set_option(tp, opts, &remaining);
 
+       if (sk_is_mptcp(sk)) {
+               unsigned int size;
+
+               if (mptcp_syn_options(sk, &size, &opts->mptcp)) {
+                       opts->options |= OPTION_MPTCP;
+                       remaining -= size;
+               }
+       }
+
        return MAX_TCP_OPTION_SPACE - remaining;
 }
 
@@ -727,6 +752,8 @@ static unsigned int tcp_synack_options(const struct sock *sk,
                }
        }
 
+       mptcp_set_option_cond(req, opts, &remaining);
+
        smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining);
 
        return MAX_TCP_OPTION_SPACE - remaining;
@@ -764,6 +791,23 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
                size += TCPOLEN_TSTAMP_ALIGNED;
        }
 
+       /* MPTCP options have precedence over SACK for the limited TCP
+        * option space because a MPTCP connection would be forced to
+        * fall back to regular TCP if a required multipath option is
+        * missing. SACK still gets a chance to use whatever space is
+        * left.
+        */
+       if (sk_is_mptcp(sk)) {
+               unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
+               unsigned int opt_size = 0;
+
+               if (mptcp_established_options(sk, skb, &opt_size, remaining,
+                                             &opts->mptcp)) {
+                       opts->options |= OPTION_MPTCP;
+                       size += opt_size;
+               }
+       }
+
        eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack;
        if (unlikely(eff_sacks)) {
                const unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
index 60068ffde1d926240d3d9410d33b4ae9d25738e1..33a578a3eb3abadb2b9d368433f1399d204f14a2 100644 (file)
@@ -238,6 +238,8 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
                sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3];
 
                icsk->icsk_af_ops = &ipv6_mapped;
+               if (sk_is_mptcp(sk))
+                       mptcp_handle_ipv6_mapped(sk, true);
                sk->sk_backlog_rcv = tcp_v4_do_rcv;
 #ifdef CONFIG_TCP_MD5SIG
                tp->af_specific = &tcp_sock_ipv6_mapped_specific;
@@ -248,6 +250,8 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
                if (err) {
                        icsk->icsk_ext_hdr_len = exthdrlen;
                        icsk->icsk_af_ops = &ipv6_specific;
+                       if (sk_is_mptcp(sk))
+                               mptcp_handle_ipv6_mapped(sk, false);
                        sk->sk_backlog_rcv = tcp_v6_do_rcv;
 #ifdef CONFIG_TCP_MD5SIG
                        tp->af_specific = &tcp_sock_ipv6_specific;
@@ -1203,6 +1207,8 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
                newnp->saddr = newsk->sk_v6_rcv_saddr;
 
                inet_csk(newsk)->icsk_af_ops = &ipv6_mapped;
+               if (sk_is_mptcp(newsk))
+                       mptcp_handle_ipv6_mapped(newsk, true);
                newsk->sk_backlog_rcv = tcp_v4_do_rcv;
 #ifdef CONFIG_TCP_MD5SIG
                newtp->af_specific = &tcp_sock_ipv6_mapped_specific;
index b7a31c0e5283061b1e879891124b3518ea65e13c..52ff2301b68bfb4b3e0aa43aeb8a43a51fd78332 100644 (file)
@@ -72,14 +72,114 @@ void mptcp_parse_option(const unsigned char *ptr, int opsize,
        }
 }
 
+/* Walk the TCP option area of @skb and pass every option of kind
+ * TCPOPT_MPTCP to mptcp_parse_option(), which records its findings in
+ * @opt_rx.
+ *
+ * Parsing stops silently at an end-of-list option, at an option whose
+ * length byte is missing or smaller than 2, or at an option that would
+ * extend past the end of the option area.
+ */
+void mptcp_get_options(const struct sk_buff *skb,
+                      struct tcp_options_received *opt_rx)
+{
+       const struct tcphdr *th = tcp_hdr(skb);
+       const unsigned char *ptr = (const unsigned char *)(th + 1);
+       int length = (th->doff * 4) - sizeof(struct tcphdr);
+
+       while (length > 0) {
+               int opcode = *ptr++;
+               int opsize;
+
+               switch (opcode) {
+               case TCPOPT_EOL:
+                       return;
+               case TCPOPT_NOP:        /* Ref: RFC 793 section 3.1 */
+                       length--;
+                       continue;
+               default:
+                       /* The length byte must itself lie inside the option
+                        * area; without this check a lone kind byte at the
+                        * very end makes us read one byte past the header.
+                        */
+                       if (length < 2)
+                               return;
+                       opsize = *ptr++;
+                       if (opsize < 2) /* "silly options" */
+                               return;
+                       if (opsize > length)
+                               return; /* don't parse partial options */
+                       if (opcode == TCPOPT_MPTCP)
+                               mptcp_parse_option(ptr, opsize, opt_rx);
+                       ptr += opsize - 2;
+                       length -= opsize;
+               }
+       }
+}
+
+/* Fill in the MP_CAPABLE option for an outgoing SYN.
+ *
+ * Returns false, leaving @size and @opts untouched, unless the subflow
+ * has request_mptcp set. Otherwise stores the local key in @opts, sets
+ * *@size to TCPOLEN_MPTCP_MPC_SYN and returns true.
+ */
+bool mptcp_syn_options(struct sock *sk, unsigned int *size,
+                      struct mptcp_out_options *opts)
+{
+       struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+
+       if (!subflow->request_mptcp)
+               return false;
+
+       pr_debug("local_key=%llu", subflow->local_key);
+       opts->suboptions = OPTION_MPTCP_MPC_SYN;
+       opts->sndr_key = subflow->local_key;
+       *size = TCPOLEN_MPTCP_MPC_SYN;
+
+       return true;
+}
+
+/* Record the outcome of the MP_CAPABLE exchange once the SYN-ACK has
+ * been parsed into tp->rx_opt.
+ *
+ * If MPTCP was requested and the peer answered with MP_CAPABLE, mark the
+ * subflow mp_capable and remember the peer's key; in every other case
+ * clear is_mptcp on the TCP socket.
+ */
+void mptcp_rcv_synsent(struct sock *sk)
+{
+       struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+       struct tcp_sock *tp = tcp_sk(sk);
+
+       pr_debug("subflow=%p", subflow);
+
+       if (!subflow->request_mptcp || !tp->rx_opt.mptcp.mp_capable) {
+               tp->is_mptcp = 0;
+               return;
+       }
+
+       subflow->mp_capable = 1;
+       subflow->remote_key = tp->rx_opt.mptcp.sndr_key;
+}
+
+/* Fill in the MP_CAPABLE option carrying both keys, once per subflow.
+ *
+ * Produces an option only while the subflow is mp_capable and fourth_ack
+ * is still clear; fourth_ack is then set so later calls return false.
+ * On success *@size is TCPOLEN_MPTCP_MPC_ACK.
+ *
+ * NOTE(review): neither @skb nor @remaining is consulted here, so the
+ * caller is not told when the option does not fit - confirm that the
+ * option space is guaranteed elsewhere.
+ */
+bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
+                              unsigned int *size, unsigned int remaining,
+                              struct mptcp_out_options *opts)
+{
+       struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+
+       if (!subflow->mp_capable || subflow->fourth_ack)
+               return false;
+
+       opts->suboptions = OPTION_MPTCP_MPC_ACK;
+       opts->sndr_key = subflow->local_key;
+       opts->rcvr_key = subflow->remote_key;
+       *size = TCPOLEN_MPTCP_MPC_ACK;
+       subflow->fourth_ack = 1;
+       pr_debug("subflow=%p, local_key=%llu, remote_key=%llu",
+                subflow, subflow->local_key, subflow->remote_key);
+
+       return true;
+}
+
+/* Fill in the MP_CAPABLE option for an outgoing SYN-ACK.
+ *
+ * Returns false, leaving @size and @opts untouched, unless the request
+ * is marked mp_capable. Otherwise stores the request's local key in
+ * @opts, sets *@size to TCPOLEN_MPTCP_MPC_SYNACK and returns true.
+ */
+bool mptcp_synack_options(const struct request_sock *req, unsigned int *size,
+                         struct mptcp_out_options *opts)
+{
+       struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
+
+       if (!subflow_req->mp_capable)
+               return false;
+
+       opts->suboptions = OPTION_MPTCP_MPC_SYNACK;
+       opts->sndr_key = subflow_req->local_key;
+       *size = TCPOLEN_MPTCP_MPC_SYNACK;
+       pr_debug("subflow_req=%p, local_key=%llu",
+                subflow_req, subflow_req->local_key);
+
+       return true;
+}
+
 void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts)
 {
        if ((OPTION_MPTCP_MPC_SYN |
+            OPTION_MPTCP_MPC_SYNACK |
             OPTION_MPTCP_MPC_ACK) & opts->suboptions) {
                u8 len;
 
                if (OPTION_MPTCP_MPC_SYN & opts->suboptions)
                        len = TCPOLEN_MPTCP_MPC_SYN;
+               else if (OPTION_MPTCP_MPC_SYNACK & opts->suboptions)
+                       len = TCPOLEN_MPTCP_MPC_SYNACK;
                else
                        len = TCPOLEN_MPTCP_MPC_ACK;
 
index 294b03a0393acb7d3b74863a77c0f012f0487c1e..bdd58da1e4f6537aa6c3704c71c4dadd68c53df9 100644 (file)
  */
 static struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk)
 {
-       if (!msk->subflow)
+       if (!msk->subflow || mptcp_subflow_ctx(msk->subflow->sk)->fourth_ack)
                return NULL;
 
        return msk->subflow;
 }
 
+/* Return the socket to use for plain-TCP passthrough, or NULL.
+ *
+ * A socket is returned only when __mptcp_nmpc_socket() yields one and
+ * the underlying TCP socket is no longer flagged as MPTCP, i.e. the
+ * MP_CAPABLE handshake has failed. The msk lock must be held.
+ */
+static struct socket *__mptcp_tcp_fallback(const struct mptcp_sock *msk)
+{
+       struct socket *ssock = __mptcp_nmpc_socket(msk);
+
+       sock_owned_by_me((const struct sock *)msk);
+
+       if (ssock && !sk_is_mptcp(ssock->sk))
+               return ssock;
+
+       return NULL;
+}
+
 static bool __mptcp_can_create_subflow(const struct mptcp_sock *msk)
 {
        return ((struct sock *)msk)->sk_state == TCP_CLOSE;
@@ -56,6 +72,7 @@ static struct socket *__mptcp_socket_create(struct mptcp_sock *msk, int state)
 
        msk->subflow = ssock;
        subflow = mptcp_subflow_ctx(ssock->sk);
+       list_add(&subflow->node, &msk->conn_list);
        subflow->request_mptcp = 1;
 
 set_state:
@@ -64,66 +81,169 @@ set_state:
        return ssock;
 }
 
+/* Return the TCP socket of the first subflow on @msk's conn_list, or
+ * NULL when the list is empty. The msk lock must be held.
+ */
+static struct sock *mptcp_subflow_get(const struct mptcp_sock *msk)
+{
+       struct mptcp_subflow_context *subflow;
+       struct sock *ssk = NULL;
+
+       sock_owned_by_me((const struct sock *)msk);
+
+       /* only the head of the list is ever looked at */
+       mptcp_for_each_subflow(msk, subflow) {
+               ssk = mptcp_subflow_tcp_sock(subflow);
+               break;
+       }
+
+       return ssk;
+}
+
 static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 {
        struct mptcp_sock *msk = mptcp_sk(sk);
-       struct socket *subflow = msk->subflow;
+       struct socket *ssock;
+       struct sock *ssk;
+       int ret;
 
        if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL))
                return -EOPNOTSUPP;
 
-       return sock_sendmsg(subflow, msg);
+       lock_sock(sk);
+       ssock = __mptcp_tcp_fallback(msk);
+       if (ssock) {
+               pr_debug("fallback passthrough");
+               ret = sock_sendmsg(ssock, msg);
+               release_sock(sk);
+               return ret;
+       }
+
+       ssk = mptcp_subflow_get(msk);
+       if (!ssk) {
+               release_sock(sk);
+               return -ENOTCONN;
+       }
+
+       ret = sock_sendmsg(ssk->sk_socket, msg);
+
+       release_sock(sk);
+       return ret;
 }
 
 static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
                         int nonblock, int flags, int *addr_len)
 {
        struct mptcp_sock *msk = mptcp_sk(sk);
-       struct socket *subflow = msk->subflow;
+       struct socket *ssock;
+       struct sock *ssk;
+       int copied = 0;
 
        if (msg->msg_flags & ~(MSG_WAITALL | MSG_DONTWAIT))
                return -EOPNOTSUPP;
 
-       return sock_recvmsg(subflow, msg, flags);
+       lock_sock(sk);
+       ssock = __mptcp_tcp_fallback(msk);
+       if (ssock) {
+               pr_debug("fallback-read subflow=%p",
+                        mptcp_subflow_ctx(ssock->sk));
+               copied = sock_recvmsg(ssock, msg, flags);
+               release_sock(sk);
+               return copied;
+       }
+
+       ssk = mptcp_subflow_get(msk);
+       if (!ssk) {
+               release_sock(sk);
+               return -ENOTCONN;
+       }
+
+       copied = sock_recvmsg(ssk->sk_socket, msg, flags);
+
+       release_sock(sk);
+
+       return copied;
+}
+
+/* Unlink one subflow from the msk and close it.
+ *
+ * Subflows are either outgoing (connect) or incoming (accept).
+ * Outgoing subflows own an in-kernel 'struct socket' and are torn down
+ * through sock_release(). Incoming subflows have no 'struct socket' of
+ * their own, so once detached from the mptcp parent they are closed
+ * with tcp_close().
+ */
+static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
+                             struct mptcp_subflow_context *subflow,
+                             long timeout)
+{
+       struct socket *sock = READ_ONCE(ssk->sk_socket);
+
+       list_del(&subflow->node);
+
+       if (!sock || sock == sk->sk_socket) {
+               /* incoming subflow */
+               tcp_close(ssk, timeout);
+       } else {
+               /* outgoing subflow */
+               sock_release(sock);
+       }
 }
 
+/* Socket init hook: start the msk with an empty subflow list.
+ * Always returns 0.
+ */
 static int mptcp_init_sock(struct sock *sk)
 {
+       struct mptcp_sock *msk = mptcp_sk(sk);
+
+       INIT_LIST_HEAD(&msk->conn_list);
+
        return 0;
 }
 
 static void mptcp_close(struct sock *sk, long timeout)
 {
+       struct mptcp_subflow_context *subflow, *tmp;
        struct mptcp_sock *msk = mptcp_sk(sk);
-       struct socket *ssock;
 
        inet_sk_state_store(sk, TCP_CLOSE);
 
-       ssock = __mptcp_nmpc_socket(msk);
-       if (ssock) {
-               pr_debug("subflow=%p", mptcp_subflow_ctx(ssock->sk));
-               sock_release(ssock);
+       lock_sock(sk);
+
+       list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) {
+               struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+
+               __mptcp_close_ssk(sk, ssk, subflow, timeout);
        }
 
-       sock_orphan(sk);
-       sock_put(sk);
+       release_sock(sk);
+       sk_common_release(sk);
 }
 
-static int mptcp_connect(struct sock *sk, struct sockaddr *saddr, int len)
+static int mptcp_get_port(struct sock *sk, unsigned short snum)
 {
        struct mptcp_sock *msk = mptcp_sk(sk);
-       int err;
+       struct socket *ssock;
 
-       saddr->sa_family = AF_INET;
+       ssock = __mptcp_nmpc_socket(msk);
+       pr_debug("msk=%p, subflow=%p", msk, ssock);
+       if (WARN_ON_ONCE(!ssock))
+               return -EINVAL;
 
-       pr_debug("msk=%p, subflow=%p", msk,
-                mptcp_subflow_ctx(msk->subflow->sk));
+       return inet_csk_get_port(ssock->sk, snum);
+}
 
-       err = kernel_connect(msk->subflow, saddr, len, 0);
+void mptcp_finish_connect(struct sock *ssk)
+{
+       struct mptcp_subflow_context *subflow;
+       struct mptcp_sock *msk;
+       struct sock *sk;
 
-       sk->sk_state = TCP_ESTABLISHED;
+       subflow = mptcp_subflow_ctx(ssk);
 
-       return err;
+       if (!subflow->mp_capable)
+               return;
+
+       sk = subflow->conn;
+       msk = mptcp_sk(sk);
+
+       /* the socket is not connected yet, no msk/subflow ops can access/race
+        * accessing the field below
+        */
+       WRITE_ONCE(msk->remote_key, subflow->remote_key);
+       WRITE_ONCE(msk->local_key, subflow->local_key);
 }
 
 static struct proto mptcp_prot = {
@@ -132,13 +252,12 @@ static struct proto mptcp_prot = {
        .init           = mptcp_init_sock,
        .close          = mptcp_close,
        .accept         = inet_csk_accept,
-       .connect        = mptcp_connect,
        .shutdown       = tcp_shutdown,
        .sendmsg        = mptcp_sendmsg,
        .recvmsg        = mptcp_recvmsg,
        .hash           = inet_hash,
        .unhash         = inet_unhash,
-       .get_port       = inet_csk_get_port,
+       .get_port       = mptcp_get_port,
        .obj_size       = sizeof(struct mptcp_sock),
        .no_autobind    = true,
 };
index 543d4d5d8985b690e27fa8fc3e64095873c38471..bd66e74155158a4e73c3a3ad175bb3fbab4a87f9 100644 (file)
 struct mptcp_sock {
        /* inet_connection_sock must be the first member */
        struct inet_connection_sock sk;
+       /* keys copied from the subflow by mptcp_finish_connect() */
+       u64             local_key;
+       u64             remote_key;
+       /* list head of subflow contexts, linked through their 'node' */
+       struct list_head conn_list;
        struct socket   *subflow; /* outgoing connect/listener/!mp_capable */
 };
 
+/* Iterate @__subflow over every subflow context linked on @__msk's
+ * conn_list through its 'node' member.
+ */
+#define mptcp_for_each_subflow(__msk, __subflow)                       \
+       list_for_each_entry(__subflow, &((__msk)->conn_list), node)
+
 static inline struct mptcp_sock *mptcp_sk(const struct sock *sk)
 {
        return (struct mptcp_sock *)sk;
 }
 
+/* Request socket used for subflows. The tcp_request_sock is the first
+ * member, which is what lets mptcp_subflow_rsk() cast a request_sock
+ * pointer to this type.
+ */
+struct mptcp_subflow_request_sock {
+       struct  tcp_request_sock sk;
+       /* mp_capable is set by subflow_init_req() when the SYN carried
+        * MP_CAPABLE and the listener requested MPTCP; mp_join and backup
+        * are not referenced by the code visible in this change.
+        */
+       u8      mp_capable : 1,
+               mp_join : 1,
+               backup : 1;
+       u64     local_key;      /* sent as sndr_key in the SYN-ACK option */
+       u64     remote_key;     /* peer's sndr_key taken from the SYN */
+};
+
+/* View @rsk as a subflow request socket (plain pointer cast, constness
+ * dropped).
+ */
+static inline struct mptcp_subflow_request_sock *
+mptcp_subflow_rsk(const struct request_sock *rsk)
+{
+       return (struct mptcp_subflow_request_sock *)rsk;
+}
+
 /* MPTCP subflow context */
 struct mptcp_subflow_context {
-       u32     request_mptcp : 1;  /* send MP_CAPABLE */
+       struct  list_head node;/* conn_list of subflows */
+       u64     local_key;
+       u64     remote_key;
+       u32     request_mptcp : 1,  /* send MP_CAPABLE */
+               mp_capable : 1,     /* remote is MPTCP capable */
+               fourth_ack : 1,     /* send initial DSS */
+               conn_finished : 1;
        struct  sock *tcp_sock;     /* tcp sk backpointer */
        struct  sock *conn;         /* parent mptcp_sock */
+       const   struct inet_connection_sock_af_ops *icsk_af_ops;
        struct  rcu_head rcu;
 };
 
@@ -74,4 +102,14 @@ mptcp_subflow_tcp_sock(const struct mptcp_subflow_context *subflow)
 void mptcp_subflow_init(void);
 int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock);
 
+extern const struct inet_connection_sock_af_ops ipv4_specific;
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+extern const struct inet_connection_sock_af_ops ipv6_specific;
+#endif
+
+void mptcp_get_options(const struct sk_buff *skb,
+                      struct tcp_options_received *opt_rx);
+
+void mptcp_finish_connect(struct sock *sk);
+
 #endif /* __MPTCP_PROTOCOL_H */
index bf813935365352892c007da7de61d20ad68aba90..df3192305967f55403f2189681f20016cd0af70c 100644 (file)
 #include <net/inet_hashtables.h>
 #include <net/protocol.h>
 #include <net/tcp.h>
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+#include <net/ip6_route.h>
+#endif
 #include <net/mptcp.h>
 #include "protocol.h"
 
+static void subflow_init_req(struct request_sock *req,
+                            const struct sock *sk_listener,
+                            struct sk_buff *skb)
+{
+       struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener);
+       struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
+       struct tcp_options_received rx_opt;
+
+       pr_debug("subflow_req=%p, listener=%p", subflow_req, listener);
+
+       memset(&rx_opt.mptcp, 0, sizeof(rx_opt.mptcp));
+       mptcp_get_options(skb, &rx_opt);
+
+       subflow_req->mp_capable = 0;
+
+#ifdef CONFIG_TCP_MD5SIG
+       /* no MPTCP if MD5SIG is enabled on this socket or we may run out of
+        * TCP option space.
+        */
+       if (rcu_access_pointer(tcp_sk(sk_listener)->md5sig_info))
+               return;
+#endif
+
+       if (rx_opt.mptcp.mp_capable && listener->request_mptcp) {
+               subflow_req->mp_capable = 1;
+               subflow_req->remote_key = rx_opt.mptcp.sndr_key;
+       }
+}
+
+/* init_req hook for IPv4 subflow requests: flag the request as MPTCP,
+ * run the regular TCP IPv4 init_req, then the MPTCP-specific setup in
+ * subflow_init_req().
+ */
+static void subflow_v4_init_req(struct request_sock *req,
+                               const struct sock *sk_listener,
+                               struct sk_buff *skb)
+{
+       tcp_rsk(req)->is_mptcp = 1;
+
+       tcp_request_sock_ipv4_ops.init_req(req, sk_listener, skb);
+
+       subflow_init_req(req, sk_listener, skb);
+}
+
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+/* init_req hook for IPv6 subflow requests: flag the request as MPTCP,
+ * run the regular TCP IPv6 init_req, then the MPTCP-specific setup in
+ * subflow_init_req().
+ */
+static void subflow_v6_init_req(struct request_sock *req,
+                               const struct sock *sk_listener,
+                               struct sk_buff *skb)
+{
+       tcp_rsk(req)->is_mptcp = 1;
+
+       tcp_request_sock_ipv6_ops.init_req(req, sk_listener, skb);
+
+       subflow_init_req(req, sk_listener, skb);
+}
+#endif
+
+/* sk_rx_dst_set replacement for subflows.
+ *
+ * Always forwards to the saved af_ops handler first. Then, the first
+ * time it runs on a subflow that has a parent connection, it notifies
+ * the MPTCP layer through mptcp_finish_connect() and sets conn_finished
+ * so the notification is not repeated.
+ */
+static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
+{
+       struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+
+       subflow->icsk_af_ops->sk_rx_dst_set(sk, skb);
+
+       if (!subflow->conn || subflow->conn_finished)
+               return;
+
+       pr_debug("subflow=%p, remote_key=%llu", mptcp_subflow_ctx(sk),
+                subflow->remote_key);
+       mptcp_finish_connect(sk);
+       subflow->conn_finished = 1;
+}
+
+static struct request_sock_ops subflow_request_sock_ops;
+static struct tcp_request_sock_ops subflow_request_sock_ipv4_ops;
+
+/* conn_request hook for IPv4 subflow listeners.
+ *
+ * SYNs addressed to broadcast or multicast are counted as listen drops
+ * and ignored (returns 0); everything else goes through
+ * tcp_conn_request() with the subflow request-sock ops.
+ */
+static int subflow_v4_conn_request(struct sock *sk, struct sk_buff *skb)
+{
+       struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+
+       pr_debug("subflow=%p", subflow);
+
+       /* Never answer to SYNs sent to broadcast or multicast */
+       if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
+               tcp_listendrop(sk);
+               return 0;
+       }
+
+       return tcp_conn_request(&subflow_request_sock_ops,
+                               &subflow_request_sock_ipv4_ops,
+                               sk, skb);
+}
+
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+static struct tcp_request_sock_ops subflow_request_sock_ipv6_ops;
+static struct inet_connection_sock_af_ops subflow_v6_specific;
+static struct inet_connection_sock_af_ops subflow_v6m_specific;
+
+/* conn_request hook for IPv6 subflow listeners.
+ *
+ * IPv4 packets are handed to subflow_v4_conn_request(). SYNs without a
+ * unicast IPv6 destination are counted as listen drops and ignored
+ * (returns 0); the rest go through tcp_conn_request() with the subflow
+ * request-sock ops.
+ */
+static int subflow_v6_conn_request(struct sock *sk, struct sk_buff *skb)
+{
+       struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+
+       pr_debug("subflow=%p", subflow);
+
+       if (skb->protocol == htons(ETH_P_IP))
+               return subflow_v4_conn_request(sk, skb);
+
+       if (!ipv6_unicast_destination(skb)) {
+               tcp_listendrop(sk);
+               return 0; /* don't send reset */
+       }
+
+       return tcp_conn_request(&subflow_request_sock_ops,
+                               &subflow_request_sock_ipv6_ops, sk, skb);
+}
+#endif
+
+/* syn_recv_sock replacement for subflow listeners.
+ *
+ * Creates the child through the listener's saved af_ops. A child that
+ * this call owns (*@own_req) but that has no subflow context is
+ * destroyed on the spot and NULL is returned instead.
+ */
+static struct sock *subflow_syn_recv_sock(const struct sock *sk,
+                                         struct sk_buff *skb,
+                                         struct request_sock *req,
+                                         struct dst_entry *dst,
+                                         struct request_sock *req_unhash,
+                                         bool *own_req)
+{
+       struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk);
+       struct sock *child;
+
+       pr_debug("listener=%p, req=%p, conn=%p", listener, req, listener->conn);
+
+       /* if the sk is MP_CAPABLE, we already received the client key */
+
+       child = listener->icsk_af_ops->syn_recv_sock(sk, skb, req, dst,
+                                                    req_unhash, own_req);
+
+       if (child && *own_req && !mptcp_subflow_ctx(child)) {
+               pr_debug("Closing child socket");
+               inet_sk_set_state(child, TCP_CLOSE);
+               sock_set_flag(child, SOCK_DEAD);
+               inet_csk_destroy_sock(child);
+               child = NULL;
+       }
+
+       return child;
+}
+
+static struct inet_connection_sock_af_ops subflow_specific;
+
+/* Pick the subflow af_ops table matching @sk's address family:
+ * subflow_v6_specific for AF_INET6 sockets when CONFIG_MPTCP_IPV6 is
+ * enabled, subflow_specific otherwise.
+ */
+static struct inet_connection_sock_af_ops *
+subflow_default_af_ops(struct sock *sk)
+{
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+       if (sk->sk_family == AF_INET6)
+               return &subflow_v6_specific;
+#endif
+       return &subflow_specific;
+}
+
+/* Switch a subflow's af_ops when it enters or leaves v4-mapped mode.
+ *
+ * @mapped selects subflow_v6m_specific; otherwise the family default
+ * from subflow_default_af_ops() is used. If the socket already uses the
+ * target table nothing happens; else the current table is saved in the
+ * subflow context and the target is installed. Compiles to an empty
+ * function without CONFIG_MPTCP_IPV6.
+ */
+void mptcp_handle_ipv6_mapped(struct sock *sk, bool mapped)
+{
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+       struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+       struct inet_connection_sock *icsk = inet_csk(sk);
+       struct inet_connection_sock_af_ops *target;
+
+       if (mapped)
+               target = &subflow_v6m_specific;
+       else
+               target = subflow_default_af_ops(sk);
+
+       pr_debug("subflow=%p family=%d ops=%p target=%p mapped=%d",
+                subflow, sk->sk_family, icsk->icsk_af_ops, target, mapped);
+
+       if (likely(icsk->icsk_af_ops == target))
+               return;
+
+       subflow->icsk_af_ops = icsk->icsk_af_ops;
+       icsk->icsk_af_ops = target;
+#endif
+}
+
 int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock)
 {
        struct mptcp_subflow_context *subflow;
@@ -22,7 +201,8 @@ int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock)
        struct socket *sf;
        int err;
 
-       err = sock_create_kern(net, PF_INET, SOCK_STREAM, IPPROTO_TCP, &sf);
+       err = sock_create_kern(net, sk->sk_family, SOCK_STREAM, IPPROTO_TCP,
+                              &sf);
        if (err)
                return err;
 
@@ -60,6 +240,7 @@ static struct mptcp_subflow_context *subflow_create_ctx(struct sock *sk,
                return NULL;
 
        rcu_assign_pointer(icsk->icsk_ulp_data, ctx);
+       INIT_LIST_HEAD(&ctx->node);
 
        pr_debug("subflow=%p", ctx);
 
@@ -70,6 +251,7 @@ static struct mptcp_subflow_context *subflow_create_ctx(struct sock *sk,
 
 static int subflow_ulp_init(struct sock *sk)
 {
+       struct inet_connection_sock *icsk = inet_csk(sk);
        struct mptcp_subflow_context *ctx;
        struct tcp_sock *tp = tcp_sk(sk);
        int err = 0;
@@ -91,6 +273,8 @@ static int subflow_ulp_init(struct sock *sk)
        pr_debug("subflow=%p, family=%d", ctx, sk->sk_family);
 
        tp->is_mptcp = 1;
+       ctx->icsk_af_ops = icsk->icsk_af_ops;
+       icsk->icsk_af_ops = subflow_default_af_ops(sk);
 out:
        return err;
 }
@@ -105,15 +289,97 @@ static void subflow_ulp_release(struct sock *sk)
        kfree_rcu(ctx, rcu);
 }
 
+/* Strip the MPTCP ULP state from @sk: clears icsk_ulp_ops and
+ * icsk_ulp_data and resets the is_mptcp flag on the TCP socket.
+ */
+static void subflow_ulp_fallback(struct sock *sk)
+{
+       struct inet_connection_sock *icsk = inet_csk(sk);
+
+       icsk->icsk_ulp_ops = NULL;
+       rcu_assign_pointer(icsk->icsk_ulp_data, NULL);
+       tcp_sk(sk)->is_mptcp = 0;
+}
+
+/* ULP clone hook: give the freshly created child its own subflow
+ * context.
+ *
+ * When the request was not mp_capable, or the context cannot be
+ * allocated, the child is stripped of its ULP state through
+ * subflow_ulp_fallback(). Otherwise the new context takes the af_ops
+ * saved in the context @newsk carried on entry and the keys from the
+ * request, and is marked as already finished.
+ */
+static void subflow_ulp_clone(const struct request_sock *req,
+                             struct sock *newsk,
+                             const gfp_t priority)
+{
+       struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
+       struct mptcp_subflow_context *old_ctx = mptcp_subflow_ctx(newsk);
+       struct mptcp_subflow_context *new_ctx;
+
+       if (!subflow_req->mp_capable)
+               goto fallback;
+
+       new_ctx = subflow_create_ctx(newsk, priority);
+       if (!new_ctx)
+               goto fallback;
+
+       new_ctx->conn_finished = 1;
+       new_ctx->icsk_af_ops = old_ctx->icsk_af_ops;
+       new_ctx->mp_capable = 1;
+       new_ctx->fourth_ack = 1;
+       new_ctx->remote_key = subflow_req->remote_key;
+       new_ctx->local_key = subflow_req->local_key;
+       return;
+
+fallback:
+       subflow_ulp_fallback(newsk);
+}
+
 static struct tcp_ulp_ops subflow_ulp_ops __read_mostly = {
        .name           = "mptcp",
        .owner          = THIS_MODULE,
        .init           = subflow_ulp_init,
        .release        = subflow_ulp_release,
+       .clone          = subflow_ulp_clone,
 };
 
+/* Size @subflow_ops for mptcp_subflow_request_sock objects and create
+ * the backing slab cache.
+ *
+ * Returns 0 on success, -ENOMEM if the cache cannot be created.
+ */
+static int subflow_ops_init(struct request_sock_ops *subflow_ops)
+{
+       subflow_ops->obj_size = sizeof(struct mptcp_subflow_request_sock);
+       subflow_ops->slab_name = "request_sock_subflow";
+
+       subflow_ops->slab = kmem_cache_create(subflow_ops->slab_name,
+                                             subflow_ops->obj_size, 0,
+                                             SLAB_ACCOUNT |
+                                             SLAB_TYPESAFE_BY_RCU,
+                                             NULL);
+
+       return subflow_ops->slab ? 0 : -ENOMEM;
+}
+
+/* One-time subflow setup: build the subflow request-sock and af_ops
+ * tables as copies of the stock TCP ones with the subflow hooks
+ * patched in, then register the "mptcp" ULP. Panics if the request-sock
+ * slab or the ULP registration fails.
+ */
 void mptcp_subflow_init(void)
 {
+       subflow_request_sock_ops = tcp_request_sock_ops;
+       if (subflow_ops_init(&subflow_request_sock_ops) != 0)
+               panic("MPTCP: failed to init subflow request sock ops\n");
+
+       subflow_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops;
+       subflow_request_sock_ipv4_ops.init_req = subflow_v4_init_req;
+
+       subflow_specific = ipv4_specific;
+       subflow_specific.conn_request = subflow_v4_conn_request;
+       subflow_specific.syn_recv_sock = subflow_syn_recv_sock;
+       subflow_specific.sk_rx_dst_set = subflow_finish_connect;
+
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+       subflow_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops;
+       subflow_request_sock_ipv6_ops.init_req = subflow_v6_init_req;
+
+       subflow_v6_specific = ipv6_specific;
+       subflow_v6_specific.conn_request = subflow_v6_conn_request;
+       subflow_v6_specific.syn_recv_sock = subflow_syn_recv_sock;
+       subflow_v6_specific.sk_rx_dst_set = subflow_finish_connect;
+
+       /* v4-mapped variant: the v6 subflow hooks combined with the IPv4
+        * transmit-side callbacks
+        */
+       subflow_v6m_specific = subflow_v6_specific;
+       subflow_v6m_specific.queue_xmit = ipv4_specific.queue_xmit;
+       subflow_v6m_specific.send_check = ipv4_specific.send_check;
+       subflow_v6m_specific.net_header_len = ipv4_specific.net_header_len;
+       subflow_v6m_specific.mtu_reduced = ipv4_specific.mtu_reduced;
+       subflow_v6m_specific.net_frag_header_len = 0;
+#endif
+
        if (tcp_register_ulp(&subflow_ulp_ops) != 0)
                panic("MPTCP: failed to register subflows to ULP\n");
 }