rm->data.m_sg, rm->data.m_nents,
DMA_TO_DEVICE);
- if (rm->rdma.m_rdma_op) {
- rds_ib_send_unmap_rdma(ic, rm->rdma.m_rdma_op);
+ if (rm->rdma.m_rdma_op.r_active) {
+ rds_ib_send_unmap_rdma(ic, &rm->rdma.m_rdma_op);
/* If the user asked for a completion notification on this
* message, we can implement three different semantics:
*/
rds_ib_send_rdma_complete(rm, wc_status);
- if (rm->rdma.m_rdma_op->r_write)
- rds_stats_add(s_send_rdma_bytes, rm->rdma.m_rdma_op->r_bytes);
+ if (rm->rdma.m_rdma_op.r_write)
+ rds_stats_add(s_send_rdma_bytes, rm->rdma.m_rdma_op.r_bytes);
else
- rds_stats_add(s_recv_rdma_bytes, rm->rdma.m_rdma_op->r_bytes);
+ rds_stats_add(s_recv_rdma_bytes, rm->rdma.m_rdma_op.r_bytes);
}
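This hunk establishes the pattern the rest of the patch repeats: struct rds_message now embeds its RDMA op instead of pointing at a separately allocated one, so every NULL-pointer test becomes a test of the new r_active flag, and callees take the op's address. A minimal self-contained sketch of the idiom (mock types; the real definitions live in net/rds/rds.h):

	/* Before: presence of an op meant m_rdma_op != NULL, freed separately.
	 * After: the op is embedded and presence is tracked by r_active. */
	struct rds_rdma_op_mock {
		unsigned int r_active;	/* set by rds_rdma_prepare() */
	};

	struct rds_message_mock {
		struct {
			struct rds_rdma_op_mock m_rdma_op;	/* embedded, not a pointer */
		} rdma;
	};

	/* was: if (rm->rdma.m_rdma_op) use(rm->rdma.m_rdma_op); */
	static int rds_rm_has_rdma(const struct rds_message_mock *rm)
	{
		return rm->rdma.m_rdma_op.r_active;
	}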
/* If anyone waited for this message to get flushed out, wake
rm = rds_send_get_message(conn, send->s_op);
if (rm) {
- if (rm->rdma.m_rdma_op)
- rds_ib_send_unmap_rdma(ic, rm->rdma.m_rdma_op);
+ if (rm->rdma.m_rdma_op.r_active)
+ rds_ib_send_unmap_rdma(ic, &rm->rdma.m_rdma_op);
rds_ib_send_rdma_complete(rm, wc.status);
rds_message_put(rm);
}
/* If it has a RDMA op, tell the peer we did it. This is
* used by the peer to release use-once RDMA MRs. */
- if (rm->rdma.m_rdma_op) {
+ if (rm->rdma.m_rdma_op.r_active) {
struct rds_ext_header_rdma ext_hdr;
- ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.m_rdma_op->r_key);
+ ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.m_rdma_op.r_key);
rds_message_add_extension(&rm->m_inc.i_hdr,
RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr));
}
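For context, the receive side (untouched by this patch) is what consumes this extension header: it hands the rkey to rds_rdma_unuse() so a use-once MR can be released. A sketch under that assumption, with declarations repeated here for self-containment:

	#include <linux/types.h>
	#include <asm/byteorder.h>

	struct rds_sock;					/* net/rds/rds.h */
	struct rds_ext_header_rdma { __be32 h_rdma_rkey; };	/* as in this patch */
	void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force);

	/* Sketch: the peer's reaction to RDS_EXTHDR_RDMA on an incoming header. */
	static void rds_handle_rdma_exthdr(struct rds_sock *rs,
					   const struct rds_ext_header_rdma *ext_hdr)
	{
		rds_rdma_unuse(rs, be32_to_cpu(ext_hdr->h_rdma_rkey), 0);
	}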
* or when requested by the user. Right now, we let
* the application choose.
*/
- if (rm->rdma.m_rdma_op && rm->rdma.m_rdma_op->r_fence)
+ if (rm->rdma.m_rdma_op.r_active && rm->rdma.m_rdma_op.r_fence)
send_flags = IB_SEND_FENCE;
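The fence decision originates in userspace: the application sets RDS_RDMA_FENCE in rds_rdma_args.flags, rds_rdma_prepare() below copies it into op->r_fence, and the send path above maps that to IB_SEND_FENCE. A userspace-side sketch, with the iovec fields and the sendmsg() plumbing elided:

	#include <linux/rds.h>
	#include <string.h>

	/* Request a fenced RDMA WRITE with completion notification. */
	static void rds_args_request_fenced_write(struct rds_rdma_args *args)
	{
		memset(args, 0, sizeof(*args));
		args->flags = RDS_RDMA_READWRITE	/* WRITE, not READ */
			    | RDS_RDMA_FENCE		/* fence behind prior RDMA reads */
			    | RDS_RDMA_NOTIFY_ME;	/* queue a notification */
		/* args->local_vec_addr, args->nr_local, args->remote_vec elided */
	}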
/*
rm->data.m_sg, rm->data.m_nents,
DMA_TO_DEVICE);
- if (rm->rdma.m_rdma_op) {
- rds_iw_send_unmap_rdma(ic, rm->rdma.m_rdma_op);
+ if (rm->rdma.m_rdma_op.r_active) {
+ rds_iw_send_unmap_rdma(ic, &rm->rdma.m_rdma_op);
/* If the user asked for a completion notification on this
* message, we can implement three different semantics:
*/
rds_iw_send_rdma_complete(rm, wc_status);
- if (rm->rdma.m_rdma_op->r_write)
- rds_stats_add(s_send_rdma_bytes, rm->rdma.m_rdma_op->r_bytes);
+ if (rm->rdma.m_rdma_op.r_write)
+ rds_stats_add(s_send_rdma_bytes, rm->rdma.m_rdma_op.r_bytes);
else
- rds_stats_add(s_recv_rdma_bytes, rm->rdma.m_rdma_op->r_bytes);
+ rds_stats_add(s_recv_rdma_bytes, rm->rdma.m_rdma_op.r_bytes);
}
/* If anyone waited for this message to get flushed out, wake
/* If it has a RDMA op, tell the peer we did it. This is
* used by the peer to release use-once RDMA MRs. */
- if (rm->rdma.m_rdma_op) {
+ if (rm->rdma.m_rdma_op.r_active) {
struct rds_ext_header_rdma ext_hdr;
- ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.m_rdma_op->r_key);
+ ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.m_rdma_op.r_key);
rds_message_add_extension(&rm->m_inc.i_hdr,
RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr));
}
* or when requested by the user. Right now, we let
* the application choose.
*/
- if (rm->rdma.m_rdma_op && rm->rdma.m_rdma_op->r_fence)
+ if (rm->rdma.m_rdma_op.r_active && rm->rdma.m_rdma_op.r_fence)
send_flags = IB_SEND_FENCE;
/*
}
rm->data.m_nents = 0;
- if (rm->rdma.m_rdma_op)
- rds_rdma_free_op(rm->rdma.m_rdma_op);
+ if (rm->rdma.m_rdma_op.r_active)
+ rds_rdma_free_op(&rm->rdma.m_rdma_op);
if (rm->rdma.m_rdma_mr)
rds_mr_put(rm->rdma.m_rdma_mr);
}
{
struct rds_message *rm;
unsigned int i;
+ int num_sgs = ceil(total_len, PAGE_SIZE);
+ int extra_bytes = num_sgs * sizeof(struct scatterlist);
- rm = rds_message_alloc(ceil(total_len, PAGE_SIZE), GFP_KERNEL);
+ rm = rds_message_alloc(extra_bytes, GFP_KERNEL);
if (!rm)
return ERR_PTR(-ENOMEM);
set_bit(RDS_MSG_PAGEVEC, &rm->m_flags);
rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len);
rm->data.m_nents = ceil(total_len, PAGE_SIZE);
+ rm->data.m_sg = rds_message_alloc_sgs(rm, num_sgs);
for (i = 0; i < rm->data.m_nents; ++i) {
sg_set_page(&rm->data.m_sg[i],
}
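Note that ceil() here is not libm's ceil(3) but RDS's own round-up-division helper, so num_sgs is simply total_len rounded up to whole pages (e.g. ceil(5000, 4096) == 2). For reference, its definition in net/rds/rds.h:

	#define ceil(x, y) \
		({ unsigned long __x = (x), __y = (y); (__x + __y - 1) / __y; })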
kfree(ro->r_notifier);
- kfree(ro);
+ ro->r_notifier = NULL;
+ ro->r_active = 0;
+}
+
+/*
+ * Count the number of pages needed to describe an incoming iovec.
+ */
+static int rds_rdma_pages(struct rds_rdma_args *args)
+{
+ struct rds_iovec vec;
+ struct rds_iovec __user *local_vec;
+ unsigned int tot_pages = 0;
+ unsigned int nr_pages;
+ unsigned int i;
+
+ local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr;
+
+ /* figure out the number of pages in the vector */
+ for (i = 0; i < args->nr_local; i++) {
+ if (copy_from_user(&vec, &local_vec[i],
+ sizeof(struct rds_iovec)))
+ return -EFAULT;
+
+ nr_pages = rds_pages_in_vec(&vec);
+ if (nr_pages == 0)
+ return -EINVAL;
+
+ tot_pages += nr_pages;
+ }
+
+ return tot_pages;
+}
+
+int rds_rdma_extra_size(struct rds_rdma_args *args)
+{
+ int pages = rds_rdma_pages(args);
+
+ if (pages < 0)
+ return pages;
+
+ return pages * sizeof(struct scatterlist);
}
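Together these helpers let the send path make one allocation that holds the message plus every scatterlist it will need, for data and RDMA alike; rds_message_alloc_sgs() then carves sg arrays out of that trailing region. A condensed sketch of the intended call pattern (names as used elsewhere in this patch; error handling abbreviated):

	/* Size the message from its cmsgs, then allocate once. */
	static struct rds_message *rds_alloc_send_rm(struct msghdr *msg,
						     int payload_len)
	{
		struct rds_message *rm;
		int extra = rds_rm_size(msg, payload_len);	/* sg bytes needed */

		if (extra < 0)
			return ERR_PTR(extra);			/* malformed cmsg */

		rm = rds_message_alloc(extra, GFP_KERNEL);
		if (!rm)
			return ERR_PTR(-ENOMEM);

		/* rds_rdma_prepare() and the data-copy path now draw their
		 * scatterlists from rm via rds_message_alloc_sgs(). */
		return rm;
	}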
/*
* args is a pointer to an in-kernel copy in the sendmsg cmsg.
*/
-static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs,
- struct rds_rdma_args *args)
+static int rds_rdma_prepare(struct rds_message *rm,
+ struct rds_sock *rs,
+ struct rds_rdma_args *args)
{
struct rds_iovec vec;
- struct rds_rdma_op *op = NULL;
+ struct rds_rdma_op *op = &rm->rdma.m_rdma_op;
unsigned int nr_pages;
- unsigned int max_pages;
unsigned int nr_bytes;
struct page **pages = NULL;
struct rds_iovec __user *local_vec;
- struct scatterlist *sg;
unsigned int nr;
unsigned int i, j;
- int ret;
+ int ret = 0;
if (rs->rs_bound_addr == 0) {
ret = -ENOTCONN; /* XXX not a great errno */
goto out;
}
- nr_pages = 0;
- max_pages = 0;
-
- local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr;
-
- /* figure out the number of pages in the vector */
- for (i = 0; i < args->nr_local; i++) {
- if (copy_from_user(&vec, &local_vec[i],
- sizeof(struct rds_iovec))) {
- ret = -EFAULT;
- goto out;
- }
-
- nr = rds_pages_in_vec(&vec);
- if (nr == 0) {
- ret = -EINVAL;
- goto out;
- }
-
- max_pages = max(nr, max_pages);
- nr_pages += nr;
- }
-
- pages = kcalloc(max_pages, sizeof(struct page *), GFP_KERNEL);
- if (!pages) {
- ret = -ENOMEM;
+ ret = rds_rdma_pages(args);
+ if (ret < 0)
goto out;
- }
+ nr_pages = ret;
- op = kzalloc(offsetof(struct rds_rdma_op, r_sg[nr_pages]), GFP_KERNEL);
- if (!op) {
+ pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
+ if (!pages) {
ret = -ENOMEM;
goto out;
}
+ op->r_sg = rds_message_alloc_sgs(rm, nr_pages);
op->r_write = !!(args->flags & RDS_RDMA_READWRITE);
op->r_fence = !!(args->flags & RDS_RDMA_FENCE);
op->r_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME);
+ op->r_active = 1;
op->r_recverr = rs->rs_recverr;
WARN_ON(!nr_pages);
sg_init_table(op->r_sg, nr_pages);
(unsigned long long)args->remote_vec.addr,
op->r_key);
+ local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr;
+
for (i = 0; i < args->nr_local; i++) {
if (copy_from_user(&vec, &local_vec[i],
sizeof(struct rds_iovec))) {
rs->rs_user_addr = vec.addr;
rs->rs_user_bytes = vec.bytes;
- /* did the user change the vec under us? */
- if (nr > max_pages || op->r_nents + nr > nr_pages) {
- ret = -EINVAL;
- goto out;
- }
/* If it's a WRITE operation, we want to pin the pages for reading.
* If it's a READ operation, we need to pin the pages for writing.
*/
for (j = 0; j < nr; j++) {
unsigned int offset = vec.addr & ~PAGE_MASK;
+ struct scatterlist *sg;
sg = &op->r_sg[op->r_nents + j];
sg_set_page(sg, pages[j],
ret = 0;
out:
kfree(pages);
- if (ret) {
- if (op)
- rds_rdma_free_op(op);
- op = ERR_PTR(ret);
- }
- return op;
+ if (ret)
+ rds_rdma_free_op(op);
+
+ return ret;
}
/*
int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
struct cmsghdr *cmsg)
{
- struct rds_rdma_op *op;
+ int ret;
if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args)) ||
- rm->rdma.m_rdma_op)
+ rm->rdma.m_rdma_op.r_active)
return -EINVAL;
- op = rds_rdma_prepare(rs, CMSG_DATA(cmsg));
- if (IS_ERR(op))
- return PTR_ERR(op);
+ ret = rds_rdma_prepare(rm, rs, CMSG_DATA(cmsg));
+ if (ret)
+ return ret;
+
rds_stats_inc(s_send_rdma);
- rm->rdma.m_rdma_op = op;
return 0;
}
rds_rdma_cookie_t m_rdma_cookie;
struct {
struct {
- struct rds_rdma_op *m_rdma_op;
+ struct rds_rdma_op m_rdma_op;
struct rds_mr *m_rdma_mr;
} rdma;
struct {
* connection.
* Therefore, we never retransmit messages with RDMA ops.
*/
- if (rm->rdma.m_rdma_op &&
+ if (rm->rdma.m_rdma_op.r_active &&
test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) {
spin_lock_irqsave(&conn->c_lock, flags);
if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags))
* keep this simple and require that the transport either
* send the whole rdma or none of it.
*/
- if (rm->rdma.m_rdma_op && !conn->c_xmit_rdma_sent) {
- ret = conn->c_trans->xmit_rdma(conn, rm->rdma.m_rdma_op);
+ if (rm->rdma.m_rdma_op.r_active && !conn->c_xmit_rdma_sent) {
+ ret = conn->c_trans->xmit_rdma(conn, &rm->rdma.m_rdma_op);
if (ret)
break;
conn->c_xmit_rdma_sent = 1;
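The xmit_rdma hook keeps its signature across this change; what changes is the argument's lifetime, since the op now aliases storage inside the message and must not be freed by the transport. For reference, the relevant slice of the transport ops (excerpted; full definition in net/rds/rds.h):

	struct rds_connection;
	struct rds_rdma_op;

	struct rds_transport_excerpt {
		int (*xmit_rdma)(struct rds_connection *conn,
				 struct rds_rdma_op *op);
	};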
spin_lock_irqsave(&rm->m_rs_lock, flags);
- ro = rm->rdma.m_rdma_op;
+ ro = &rm->rdma.m_rdma_op;
if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) &&
- ro && ro->r_notify && ro->r_notifier) {
+ ro->r_active && ro->r_notify && ro->r_notifier) {
notifier = ro->r_notifier;
rs = rm->m_rs;
sock_hold(rds_rs_to_sk(rs));
{
struct rds_rdma_op *ro;
- ro = rm->rdma.m_rdma_op;
- if (ro && ro->r_notify && ro->r_notifier) {
+ ro = &rm->rdma.m_rdma_op;
+ if (ro->r_active && ro->r_notify && ro->r_notifier) {
ro->r_notifier->n_status = status;
list_add_tail(&ro->r_notifier->n_list, &rs->rs_notify_queue);
ro->r_notifier = NULL;
spin_lock_irqsave(&conn->c_lock, flags);
list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
- if (rm->rdma.m_rdma_op == op) {
+ if (&rm->rdma.m_rdma_op == op) {
atomic_inc(&rm->m_refcount);
found = rm;
goto out;
}
list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, m_conn_item) {
- if (rm->rdma.m_rdma_op == op) {
+ if (&rm->rdma.m_rdma_op == op) {
atomic_inc(&rm->m_refcount);
found = rm;
break;
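Comparing op pointers still works here because each message's op now sits at a fixed offset inside it; the same property lets a caller map an op straight back to its message. A sketch of that idiom (the helper is illustrative, not part of the patch):

	#include <linux/kernel.h>	/* container_of() */

	/* Valid because m_rdma_op is embedded in struct rds_message. */
	static inline struct rds_message *rds_rm_from_rdma_op(struct rds_rdma_op *op)
	{
		return container_of(op, struct rds_message, rdma.m_rdma_op);
	}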
spin_lock(&rs->rs_lock);
if (test_and_clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags)) {
- struct rds_rdma_op *ro = rm->rdma.m_rdma_op;
+ struct rds_rdma_op *ro = &rm->rdma.m_rdma_op;
struct rds_notifier *notifier;
list_del_init(&rm->m_sock_item);
rds_send_sndbuf_remove(rs, rm);
- if (ro && ro->r_notifier && (status || ro->r_notify)) {
+ if (ro->r_active && ro->r_notifier &&
+ (status || ro->r_notify)) {
notifier = ro->r_notifier;
list_add_tail(&notifier->n_list,
&rs->rs_notify_queue);
if (!notifier->n_status)
notifier->n_status = status;
- rm->rdma.m_rdma_op->r_notifier = NULL;
+ rm->rdma.m_rdma_op.r_notifier = NULL;
}
was_on_sock = 1;
rm->m_rs = NULL;
*/
static int rds_rm_size(struct msghdr *msg, int data_len)
{
+ struct cmsghdr *cmsg;
int size = 0;
+ int retval;
+
+ for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
+ if (!CMSG_OK(msg, cmsg))
+ return -EINVAL;
+
+ if (cmsg->cmsg_level != SOL_RDS)
+ continue;
+
+ switch (cmsg->cmsg_type) {
+ case RDS_CMSG_RDMA_ARGS:
+ retval = rds_rdma_extra_size(CMSG_DATA(cmsg));
+ if (retval < 0)
+ return retval;
+ size += retval;
+ break;
+
+ case RDS_CMSG_RDMA_DEST:
+ case RDS_CMSG_RDMA_MAP:
+ /* these are valid but do not add any size */
+ break;
+
+ default:
+ return -EINVAL;
+ }
+
+ }
size += ceil(data_len, PAGE_SIZE) * sizeof(struct scatterlist);
return size;
}
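To see what this walk parses, here is a userspace-side sketch of attaching the arguments that arrive as RDS_CMSG_RDMA_ARGS (socket setup, the iovec array, and sendmsg() itself elided):

	#include <linux/rds.h>
	#include <string.h>
	#include <sys/socket.h>

	/* Attach rds_rdma_args as the SOL_RDS control message consumed by
	 * rds_rm_size() and rds_cmsg_rdma_args(); cbuf must provide at least
	 * CMSG_SPACE(sizeof(*args)) bytes. */
	static void rds_attach_rdma_cmsg(struct msghdr *msg, void *cbuf,
					 const struct rds_rdma_args *args)
	{
		struct cmsghdr *cmsg;

		msg->msg_control = cbuf;
		msg->msg_controllen = CMSG_SPACE(sizeof(*args));

		cmsg = CMSG_FIRSTHDR(msg);
		cmsg->cmsg_level = SOL_RDS;
		cmsg->cmsg_type = RDS_CMSG_RDMA_ARGS;
		cmsg->cmsg_len = CMSG_LEN(sizeof(*args));
		memcpy(CMSG_DATA(cmsg), args, sizeof(*args));
	}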
if (ret)
goto out;
- if ((rm->m_rdma_cookie || rm->rdma.m_rdma_op) &&
+ if ((rm->m_rdma_cookie || rm->rdma.m_rdma_op.r_active) &&
!conn->c_trans->xmit_rdma) {
if (printk_ratelimit())
printk(KERN_NOTICE "rdma_op %p conn xmit_rdma %p\n",
- rm->rdma.m_rdma_op, conn->c_trans->xmit_rdma);
+ &rm->rdma.m_rdma_op, conn->c_trans->xmit_rdma);
ret = -EOPNOTSUPP;
goto out;
}