mptcp: Add handling of outgoing MP_JOIN requests
authorPeter Krystad <peter.krystad@linux.intel.com>
Fri, 27 Mar 2020 21:48:40 +0000 (14:48 -0700)
committerDavid S. Miller <davem@davemloft.net>
Mon, 30 Mar 2020 05:14:48 +0000 (22:14 -0700)
Subflow creation may be initiated by the path manager when
the primary connection is fully established and a remote
address has been received via ADD_ADDR.

Create an in-kernel sock and use kernel_connect() to
initiate connection.

Passive sockets can't acquire the mptcp socket lock at
subflow creation time, so an additional list protected by
a new spinlock is used to track the MPJ subflows.

Such list is spliced into conn_list tail every time the msk
socket lock is acquired, so that it will not interfere
with data flow on the original connection.

Data flow and connection failover not addressed by this commit.

Co-developed-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Florian Westphal <fw@strlen.de>
Co-developed-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Co-developed-by: Matthieu Baerts <matthieu.baerts@tessares.net>
Signed-off-by: Matthieu Baerts <matthieu.baerts@tessares.net>
Signed-off-by: Peter Krystad <peter.krystad@linux.intel.com>
Signed-off-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
include/net/mptcp.h
net/mptcp/options.c
net/mptcp/protocol.c
net/mptcp/protocol.h
net/mptcp/subflow.c

index a4aea0e4addc351a415769e1fd6a6092adb3e240..b648fa20eec8f7417a11f90e0f811f9224780cb2 100644 (file)
@@ -46,6 +46,8 @@ struct mptcp_out_options {
        u8 backup;
        u32 nonce;
        u64 thmac;
+       u32 token;
+       u8 hmac[20];
        struct mptcp_ext ext_copy;
 #endif
 };
index 8e2b2dbadf6d5724534ba023b4d6c4e5b400ac73..20ba00865c55ce2c2d0643378a49e5c0444d198f 100644 (file)
@@ -328,6 +328,16 @@ bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb,
                opts->sndr_key = subflow->local_key;
                *size = TCPOLEN_MPTCP_MPC_SYN;
                return true;
+       } else if (subflow->request_join) {
+               pr_debug("remote_token=%u, nonce=%u", subflow->remote_token,
+                        subflow->local_nonce);
+               opts->suboptions = OPTION_MPTCP_MPJ_SYN;
+               opts->join_id = subflow->local_id;
+               opts->token = subflow->remote_token;
+               opts->nonce = subflow->local_nonce;
+               opts->backup = subflow->request_bkup;
+               *size = TCPOLEN_MPTCP_MPJ_SYN;
+               return true;
        }
        return false;
 }
@@ -337,16 +347,55 @@ void mptcp_rcv_synsent(struct sock *sk)
        struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
        struct tcp_sock *tp = tcp_sk(sk);
 
-       pr_debug("subflow=%p", subflow);
        if (subflow->request_mptcp && tp->rx_opt.mptcp.mp_capable) {
                subflow->mp_capable = 1;
                subflow->can_ack = 1;
                subflow->remote_key = tp->rx_opt.mptcp.sndr_key;
-       } else {
+               pr_debug("subflow=%p, remote_key=%llu", subflow,
+                        subflow->remote_key);
+       } else if (subflow->request_join && tp->rx_opt.mptcp.mp_join) {
+               subflow->mp_join = 1;
+               subflow->thmac = tp->rx_opt.mptcp.thmac;
+               subflow->remote_nonce = tp->rx_opt.mptcp.nonce;
+               pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u", subflow,
+                        subflow->thmac, subflow->remote_nonce);
+       } else if (subflow->request_mptcp) {
                tcp_sk(sk)->is_mptcp = 0;
        }
 }
 
+/* MP_JOIN client subflow must wait for 4th ack before sending any data:
+ * TCP can't schedule delack timer before the subflow is fully established.
+ * MPTCP uses the delack timer to do 3rd ack retransmissions
+ */
+static void schedule_3rdack_retransmission(struct sock *sk)
+{
+       struct inet_connection_sock *icsk = inet_csk(sk);
+       struct tcp_sock *tp = tcp_sk(sk);
+       unsigned long timeout;
+
+       /* reschedule with a timeout above RTT, as we must look only for drop */
+       if (tp->srtt_us)
+               timeout = tp->srtt_us << 1;
+       else
+               timeout = TCP_TIMEOUT_INIT;
+
+       WARN_ON_ONCE(icsk->icsk_ack.pending & ICSK_ACK_TIMER);
+       icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER;
+       icsk->icsk_ack.timeout = timeout;
+       sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout);
+}
+
+static void clear_3rdack_retransmission(struct sock *sk)
+{
+       struct inet_connection_sock *icsk = inet_csk(sk);
+
+       sk_stop_timer(sk, &icsk->icsk_delack_timer);
+       icsk->icsk_ack.timeout = 0;
+       icsk->icsk_ack.ato = 0;
+       icsk->icsk_ack.pending &= ~(ICSK_ACK_SCHED | ICSK_ACK_TIMER);
+}
+
 static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb,
                                         unsigned int *size,
                                         unsigned int remaining,
@@ -356,17 +405,21 @@ static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb,
        struct mptcp_ext *mpext;
        unsigned int data_len;
 
-       pr_debug("subflow=%p fully established=%d seq=%x:%x remaining=%d",
-                subflow, subflow->fully_established, subflow->snd_isn,
-                skb ? TCP_SKB_CB(skb)->seq : 0, remaining);
+       /* When skb is not available, we better over-estimate the emitted
+        * options len. A full DSS option (28 bytes) is longer than
+        * TCPOLEN_MPTCP_MPC_ACK_DATA(22) or TCPOLEN_MPTCP_MPJ_ACK(24), so
+        * tell the caller to defer the estimate to
+        * mptcp_established_options_dss(), which will reserve enough space.
+        */
+       if (!skb)
+               return false;
 
-       if (subflow->mp_capable && !subflow->fully_established && skb &&
-           subflow->snd_isn == TCP_SKB_CB(skb)->seq) {
-               /* When skb is not available, we better over-estimate the
-                * emitted options len. A full DSS option is longer than
-                * TCPOLEN_MPTCP_MPC_ACK_DATA, so let's the caller try to fit
-                * that.
-                */
+       /* MPC/MPJ needed only on 3rd ack packet */
+       if (subflow->fully_established ||
+           subflow->snd_isn != TCP_SKB_CB(skb)->seq)
+               return false;
+
+       if (subflow->mp_capable) {
                mpext = mptcp_get_ext(skb);
                data_len = mpext ? mpext->data_len : 0;
 
@@ -394,6 +447,14 @@ static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb,
                         data_len);
 
                return true;
+       } else if (subflow->mp_join) {
+               opts->suboptions = OPTION_MPTCP_MPJ_ACK;
+               memcpy(opts->hmac, subflow->hmac, MPTCPOPT_HMAC_LEN);
+               *size = TCPOLEN_MPTCP_MPJ_ACK;
+               pr_debug("subflow=%p", subflow);
+
+               schedule_3rdack_retransmission(sk);
+               return true;
        }
        return false;
 }
@@ -674,10 +735,12 @@ fully_established:
                return true;
 
        subflow->pm_notified = 1;
-       if (subflow->mp_join)
+       if (subflow->mp_join) {
+               clear_3rdack_retransmission(sk);
                mptcp_pm_subflow_established(msk, subflow);
-       else
+       } else {
                mptcp_pm_fully_established(msk);
+       }
        return true;
 }
 
@@ -860,6 +923,16 @@ mp_capable_done:
                                      0, opts->rm_id);
        }
 
+       if (OPTION_MPTCP_MPJ_SYN & opts->suboptions) {
+               *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN,
+                                     TCPOLEN_MPTCP_MPJ_SYN,
+                                     opts->backup, opts->join_id);
+               put_unaligned_be32(opts->token, ptr);
+               ptr += 1;
+               put_unaligned_be32(opts->nonce, ptr);
+               ptr += 1;
+       }
+
        if (OPTION_MPTCP_MPJ_SYNACK & opts->suboptions) {
                *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN,
                                      TCPOLEN_MPTCP_MPJ_SYNACK,
@@ -870,6 +943,13 @@ mp_capable_done:
                ptr += 1;
        }
 
+       if (OPTION_MPTCP_MPJ_ACK & opts->suboptions) {
+               *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN,
+                                     TCPOLEN_MPTCP_MPJ_ACK, 0, 0);
+               memcpy(ptr, opts->hmac, MPTCPOPT_HMAC_LEN);
+               ptr += 5;
+       }
+
        if (opts->ext_copy.use_ack || opts->ext_copy.use_map) {
                struct mptcp_ext *mpext = &opts->ext_copy;
                u8 len = TCPOLEN_MPTCP_DSS_BASE;
index f2dac715a17ee2edc46c273b454a54cbceb915ce..3d84e0b83c99ae14f26d9064005283fd766added 100644 (file)
@@ -241,6 +241,16 @@ wake:
        sk->sk_data_ready(sk);
 }
 
+static void __mptcp_flush_join_list(struct mptcp_sock *msk)
+{
+       if (likely(list_empty(&msk->join_list)))
+               return;
+
+       spin_lock_bh(&msk->join_list_lock);
+       list_splice_tail_init(&msk->join_list, &msk->conn_list);
+       spin_unlock_bh(&msk->join_list_lock);
+}
+
 static bool mptcp_ext_cache_refill(struct mptcp_sock *msk)
 {
        if (!msk->cached_ext)
@@ -462,6 +472,7 @@ fallback:
                return ret >= 0 ? ret + copied : (copied ? copied : ret);
        }
 
+       __mptcp_flush_join_list(msk);
        ssk = mptcp_subflow_get_send(msk);
        while (!sk_stream_memory_free(sk) || !ssk) {
                ret = sk_stream_wait_memory(sk, &timeo);
@@ -603,6 +614,7 @@ fallback:
 
        len = min_t(size_t, len, INT_MAX);
        target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
+       __mptcp_flush_join_list(msk);
 
        while (len > (size_t)copied) {
                int bytes_read;
@@ -718,6 +730,7 @@ static void mptcp_worker(struct work_struct *work)
        struct sock *sk = &msk->sk.icsk_inet.sk;
 
        lock_sock(sk);
+       __mptcp_flush_join_list(msk);
        __mptcp_move_skbs(msk);
        release_sock(sk);
        sock_put(sk);
@@ -727,7 +740,10 @@ static int __mptcp_init_sock(struct sock *sk)
 {
        struct mptcp_sock *msk = mptcp_sk(sk);
 
+       spin_lock_init(&msk->join_list_lock);
+
        INIT_LIST_HEAD(&msk->conn_list);
+       INIT_LIST_HEAD(&msk->join_list);
        __set_bit(MPTCP_SEND_SPACE, &msk->flags);
        INIT_WORK(&msk->work, mptcp_worker);
 
@@ -800,6 +816,8 @@ static void mptcp_close(struct sock *sk, long timeout)
        mptcp_token_destroy(msk->token);
        inet_sk_state_store(sk, TCP_CLOSE);
 
+       __mptcp_flush_join_list(msk);
+
        list_splice_init(&msk->conn_list, &conn_list);
 
        data_fin_tx_seq = msk->write_seq;
@@ -1107,6 +1125,7 @@ bool mptcp_finish_join(struct sock *sk)
        struct mptcp_sock *msk = mptcp_sk(subflow->conn);
        struct sock *parent = (void *)msk;
        struct socket *parent_sock;
+       bool ret;
 
        pr_debug("msk=%p, subflow=%p", msk, subflow);
 
@@ -1122,7 +1141,15 @@ bool mptcp_finish_join(struct sock *sk)
        if (parent_sock && !sk->sk_socket)
                mptcp_sock_graft(sk, parent_sock);
 
-       return mptcp_pm_allow_new_subflow(msk);
+       ret = mptcp_pm_allow_new_subflow(msk);
+       if (ret) {
+               /* active connections are already on conn_list */
+               spin_lock_bh(&msk->join_list_lock);
+               if (!WARN_ON_ONCE(!list_empty(&subflow->node)))
+                       list_add_tail(&subflow->node, &msk->join_list);
+               spin_unlock_bh(&msk->join_list_lock);
+       }
+       return ret;
 }
 
 bool mptcp_sk_is_subflow(const struct sock *sk)
@@ -1311,6 +1338,7 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
                /* set ssk->sk_socket of accept()ed flows to mptcp socket.
                 * This is needed so NOSPACE flag can be set from tcp stack.
                 */
+               __mptcp_flush_join_list(msk);
                list_for_each_entry(subflow, &msk->conn_list, node) {
                        struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
 
@@ -1392,6 +1420,7 @@ static int mptcp_shutdown(struct socket *sock, int how)
                        sock->state = SS_CONNECTED;
        }
 
+       __mptcp_flush_join_list(msk);
        mptcp_for_each_subflow(msk, subflow) {
                struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow);
 
index ef94e36b8560c339c9c2dc03473806ec160eba68..df134ac912744c2e9f7b51bf063744f08da0cb69 100644 (file)
 #define TCPOLEN_MPTCP_PORT_LEN         2
 #define TCPOLEN_MPTCP_RM_ADDR_BASE     4
 
+/* MPTCP MP_JOIN flags */
 #define MPTCPOPT_BACKUP                BIT(0)
 #define MPTCPOPT_HMAC_LEN      20
+#define MPTCPOPT_THMAC_LEN     8
 
 /* MPTCP MP_CAPABLE flags */
 #define MPTCP_VERSION_MASK     (0x0F)
@@ -148,8 +150,10 @@ struct mptcp_sock {
        u32             token;
        unsigned long   flags;
        bool            can_ack;
+       spinlock_t      join_list_lock;
        struct work_struct work;
        struct list_head conn_list;
+       struct list_head join_list;
        struct skb_ext  *cached_ext;    /* for the next sendmsg */
        struct socket   *subflow; /* outgoing connect/listener/!mp_capable */
        struct sock     *first;
@@ -202,6 +206,8 @@ struct mptcp_subflow_context {
        u32     ssn_offset;
        u32     map_data_len;
        u32     request_mptcp : 1,  /* send MP_CAPABLE */
+               request_join : 1,   /* send MP_JOIN */
+               request_bkup : 1,
                mp_capable : 1,     /* remote is MPTCP capable */
                mp_join : 1,        /* remote is JOINing */
                fully_established : 1,      /* path validated */
@@ -218,6 +224,8 @@ struct mptcp_subflow_context {
        u32     remote_nonce;
        u64     thmac;
        u32     local_nonce;
+       u32     remote_token;
+       u8      hmac[MPTCPOPT_HMAC_LEN];
        u8      local_id;
        u8      remote_id;
 
@@ -263,6 +271,11 @@ mptcp_subflow_get_mapped_dsn(const struct mptcp_subflow_context *subflow)
 int mptcp_is_enabled(struct net *net);
 bool mptcp_subflow_data_available(struct sock *sk);
 void mptcp_subflow_init(void);
+
+/* called with sk socket lock held */
+int __mptcp_subflow_connect(struct sock *sk, int ifindex,
+                           const struct mptcp_addr_info *loc,
+                           const struct mptcp_addr_info *remote);
 int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock);
 
 static inline void mptcp_subflow_tcp_fallback(struct sock *sk,
index e7caa4f6e1e54077123587bf4fda6835dff43bc5..ba636cd84a3cb8c3dccd30e60c4ca095e33027db 100644 (file)
 static int subflow_rebuild_header(struct sock *sk)
 {
        struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
-       int err = 0;
+       int local_id, err = 0;
 
        if (subflow->request_mptcp && !subflow->token) {
                pr_debug("subflow=%p", sk);
                err = mptcp_token_new_connect(sk);
+       } else if (subflow->request_join && !subflow->local_nonce) {
+               struct mptcp_sock *msk = (struct mptcp_sock *)subflow->conn;
+
+               pr_debug("subflow=%p", sk);
+
+               do {
+                       get_random_bytes(&subflow->local_nonce, sizeof(u32));
+               } while (!subflow->local_nonce);
+
+               if (subflow->local_id)
+                       goto out;
+
+               local_id = mptcp_pm_get_local_id(msk, (struct sock_common *)sk);
+               if (local_id < 0)
+                       return -EINVAL;
+
+               subflow->local_id = local_id;
        }
 
+out:
        if (err)
                return err;
 
@@ -131,6 +149,7 @@ static void subflow_init_req(struct request_sock *req,
 
                subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq;
        } else if (rx_opt.mptcp.mp_join && listener->request_mptcp) {
+               subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq;
                subflow_req->mp_join = 1;
                subflow_req->backup = rx_opt.mptcp.backup;
                subflow_req->remote_id = rx_opt.mptcp.join_id;
@@ -169,6 +188,25 @@ static void subflow_v6_init_req(struct request_sock *req,
 }
 #endif
 
+/* validate received truncated hmac and create hmac for third ACK */
+static bool subflow_thmac_valid(struct mptcp_subflow_context *subflow)
+{
+       u8 hmac[MPTCPOPT_HMAC_LEN];
+       u64 thmac;
+
+       subflow_generate_hmac(subflow->remote_key, subflow->local_key,
+                             subflow->remote_nonce, subflow->local_nonce,
+                             hmac);
+
+       thmac = get_unaligned_be64(hmac);
+       pr_debug("subflow=%p, token=%u, thmac=%llu, subflow->thmac=%llu\n",
+                subflow, subflow->token,
+                (unsigned long long)thmac,
+                (unsigned long long)subflow->thmac);
+
+       return thmac == subflow->thmac;
+}
+
 static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
 {
        struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
@@ -181,7 +219,10 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
                parent->sk_state_change(parent);
        }
 
-       if (!subflow->conn_finished) {
+       if (subflow->conn_finished || !tcp_sk(sk)->is_mptcp)
+               return;
+
+       if (subflow->mp_capable) {
                pr_debug("subflow=%p, remote_key=%llu", mptcp_subflow_ctx(sk),
                         subflow->remote_key);
                mptcp_finish_connect(sk);
@@ -191,6 +232,31 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
                        pr_debug("synack seq=%u", TCP_SKB_CB(skb)->seq);
                        subflow->ssn_offset = TCP_SKB_CB(skb)->seq;
                }
+       } else if (subflow->mp_join) {
+               pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u",
+                        subflow, subflow->thmac,
+                        subflow->remote_nonce);
+               if (!subflow_thmac_valid(subflow)) {
+                       subflow->mp_join = 0;
+                       goto do_reset;
+               }
+
+               subflow_generate_hmac(subflow->local_key, subflow->remote_key,
+                                     subflow->local_nonce,
+                                     subflow->remote_nonce,
+                                     subflow->hmac);
+
+               if (skb)
+                       subflow->ssn_offset = TCP_SKB_CB(skb)->seq;
+
+               if (!mptcp_finish_join(sk))
+                       goto do_reset;
+
+               subflow->conn_finished = 1;
+       } else {
+do_reset:
+               tcp_send_active_reset(sk, GFP_ATOMIC);
+               tcp_done(sk);
        }
 }
 
@@ -737,6 +803,85 @@ void mptcpv6_handle_mapped(struct sock *sk, bool mapped)
 }
 #endif
 
+static void mptcp_info2sockaddr(const struct mptcp_addr_info *info,
+                               struct sockaddr_storage *addr)
+{
+       memset(addr, 0, sizeof(*addr));
+       addr->ss_family = info->family;
+       if (addr->ss_family == AF_INET) {
+               struct sockaddr_in *in_addr = (struct sockaddr_in *)addr;
+
+               in_addr->sin_addr = info->addr;
+               in_addr->sin_port = info->port;
+       }
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+       else if (addr->ss_family == AF_INET6) {
+               struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)addr;
+
+               in6_addr->sin6_addr = info->addr6;
+               in6_addr->sin6_port = info->port;
+       }
+#endif
+}
+
+int __mptcp_subflow_connect(struct sock *sk, int ifindex,
+                           const struct mptcp_addr_info *loc,
+                           const struct mptcp_addr_info *remote)
+{
+       struct mptcp_sock *msk = mptcp_sk(sk);
+       struct mptcp_subflow_context *subflow;
+       struct sockaddr_storage addr;
+       struct socket *sf;
+       u32 remote_token;
+       int addrlen;
+       int err;
+
+       if (sk->sk_state != TCP_ESTABLISHED)
+               return -ENOTCONN;
+
+       err = mptcp_subflow_create_socket(sk, &sf);
+       if (err)
+               return err;
+
+       subflow = mptcp_subflow_ctx(sf->sk);
+       subflow->remote_key = msk->remote_key;
+       subflow->local_key = msk->local_key;
+       subflow->token = msk->token;
+       mptcp_info2sockaddr(loc, &addr);
+
+       addrlen = sizeof(struct sockaddr_in);
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+       if (loc->family == AF_INET6)
+               addrlen = sizeof(struct sockaddr_in6);
+#endif
+       sf->sk->sk_bound_dev_if = ifindex;
+       err = kernel_bind(sf, (struct sockaddr *)&addr, addrlen);
+       if (err)
+               goto failed;
+
+       mptcp_crypto_key_sha(subflow->remote_key, &remote_token, NULL);
+       pr_debug("msk=%p remote_token=%u", msk, remote_token);
+       subflow->remote_token = remote_token;
+       subflow->local_id = loc->id;
+       subflow->request_join = 1;
+       subflow->request_bkup = 1;
+       mptcp_info2sockaddr(remote, &addr);
+
+       err = kernel_connect(sf, (struct sockaddr *)&addr, addrlen, O_NONBLOCK);
+       if (err && err != -EINPROGRESS)
+               goto failed;
+
+       spin_lock_bh(&msk->join_list_lock);
+       list_add_tail(&subflow->node, &msk->join_list);
+       spin_unlock_bh(&msk->join_list_lock);
+
+       return err;
+
+failed:
+       sock_release(sf);
+       return err;
+}
+
 int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock)
 {
        struct mptcp_subflow_context *subflow;
@@ -934,6 +1079,7 @@ static void subflow_ulp_clone(const struct request_sock *req,
                new_ctx->ssn_offset = subflow_req->ssn_offset;
                new_ctx->idsn = subflow_req->idsn;
        } else if (subflow_req->mp_join) {
+               new_ctx->ssn_offset = subflow_req->ssn_offset;
                new_ctx->mp_join = 1;
                new_ctx->fully_established = 1;
                new_ctx->backup = subflow_req->backup;