namespaces: ipc namespaces: implement support for posix msqueues

author Serge E. Hallyn <serue@us.ibm.com>

Tue, 7 Apr 2009 02:01:10 +0000 (19:01 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Tue, 7 Apr 2009 15:31:09 +0000 (08:31 -0700)
author Serge E. Hallyn <serue@us.ibm.com>
Tue, 7 Apr 2009 02:01:10 +0000 (19:01 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Tue, 7 Apr 2009 15:31:09 +0000 (08:31 -0700)
diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h

index 3e6fcacebe8ac24d7995840e1d353f33c53f5df0..3392d50de351536881a4bfeed0bf1bf36bc9e4ca 100644 (file)
--- a/include/linux/ipc_namespace.h
+++ b/include/linux/ipc_namespace.h
@@ -25,7 +25,7 @@ struct ipc_ids {
  };
  
  struct ipc_namespace {
-       struct kref     kref;
+       atomic_t        count;
         struct ipc_ids  ids[3];
  
         int             sem_ctls[4];
@@ -61,6 +61,7 @@ struct ipc_namespace {
  extern struct ipc_namespace init_ipc_ns;
  extern atomic_t nr_ipc_ns;
  
+extern spinlock_t mq_lock;
  #if defined(CONFIG_POSIX_MQUEUE) || defined(CONFIG_SYSVIPC)
  #define INIT_IPC_NS(ns)                .ns             = &init_ipc_ns,
  #else
@@ -82,18 +83,18 @@ static inline int ipcns_notify(unsigned long l) { return 0; }
  #endif /* CONFIG_SYSVIPC */
  
  #ifdef CONFIG_POSIX_MQUEUE
-extern void mq_init_ns(struct ipc_namespace *ns);
+extern int mq_init_ns(struct ipc_namespace *ns);
  /* default values */
  #define DFLT_QUEUESMAX 256     /* max number of message queues */
  #define DFLT_MSGMAX    10      /* max number of messages in each queue */
  #define HARD_MSGMAX    (131072/sizeof(void *))
  #define DFLT_MSGSIZEMAX 8192   /* max message size */
  #else
-#define mq_init_ns(ns) ((void) 0)
+static inline int mq_init_ns(struct ipc_namespace *ns) { return 0; }
  #endif
  
  #if defined(CONFIG_IPC_NS)
-extern void free_ipc_ns(struct kref *kref);
+extern void free_ipc_ns(struct ipc_namespace *ns);
  extern struct ipc_namespace *copy_ipcs(unsigned long flags,
                                        struct ipc_namespace *ns);
  extern void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids,
@@ -103,14 +104,11 @@ extern void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids,
  static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns)
  {
         if (ns)
-               kref_get(&ns->kref);
+               atomic_inc(&ns->count);
         return ns;
  }
  
-static inline void put_ipc_ns(struct ipc_namespace *ns)
-{
-       kref_put(&ns->kref, free_ipc_ns);
-}
+extern void put_ipc_ns(struct ipc_namespace *ns);
  #else
  static inline struct ipc_namespace *copy_ipcs(unsigned long flags,
                 struct ipc_namespace *ns)
diff --git a/ipc/mqueue.c b/ipc/mqueue.c

index a3673a09069a7e71dbda2f3d0b24296a87704f74..c82d7b51ef6801d4c477b59386ca7cfe9c5d0cc5 100644 (file)
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -88,7 +88,6 @@ static const struct file_operations mqueue_file_operations;
  static struct super_operations mqueue_super_ops;
  static void remove_notification(struct mqueue_inode_info *info);
  
-static spinlock_t mq_lock;
  static struct kmem_cache *mqueue_inode_cachep;
  
  static struct ctl_table_header * mq_sysctl_table;
@@ -98,27 +97,30 @@ static inline struct mqueue_inode_info *MQUEUE_I(struct inode *inode)
         return container_of(inode, struct mqueue_inode_info, vfs_inode);
  }
  
-void mq_init_ns(struct ipc_namespace *ns)
+/*
+ * This routine should be called with the mq_lock held.
+ */
+static inline struct ipc_namespace *__get_ns_from_inode(struct inode *inode)
  {
-       ns->mq_queues_count  = 0;
-       ns->mq_queues_max    = DFLT_QUEUESMAX;
-       ns->mq_msg_max       = DFLT_MSGMAX;
-       ns->mq_msgsize_max   = DFLT_MSGSIZEMAX;
-       ns->mq_mnt           = mntget(init_ipc_ns.mq_mnt);
+       return get_ipc_ns(inode->i_sb->s_fs_info);
  }
  
-void mq_exit_ns(struct ipc_namespace *ns)
+static struct ipc_namespace *get_ns_from_inode(struct inode *inode)
  {
-       /* will need to clear out ns->mq_mnt->mnt_sb->s_fs_info here */
-       mntput(ns->mq_mnt);
+       struct ipc_namespace *ns;
+
+       spin_lock(&mq_lock);
+       ns = __get_ns_from_inode(inode);
+       spin_unlock(&mq_lock);
+       return ns;
  }
  
-static struct inode *mqueue_get_inode(struct super_block *sb, int mode,
-                                                       struct mq_attr *attr)
+static struct inode *mqueue_get_inode(struct super_block *sb,
+               struct ipc_namespace *ipc_ns, int mode,
+               struct mq_attr *attr)
  {
         struct user_struct *u = current_user();
         struct inode *inode;
-       struct ipc_namespace *ipc_ns = &init_ipc_ns;
  
         inode = new_inode(sb);
         if (inode) {
@@ -193,30 +195,38 @@ out_inode:
  static int mqueue_fill_super(struct super_block *sb, void *data, int silent)
  {
         struct inode *inode;
+       struct ipc_namespace *ns = data;
+       int error = 0;
  
         sb->s_blocksize = PAGE_CACHE_SIZE;
         sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
         sb->s_magic = MQUEUE_MAGIC;
         sb->s_op = &mqueue_super_ops;
  
-       inode = mqueue_get_inode(sb, S_IFDIR | S_ISVTX | S_IRWXUGO, NULL);
-       if (!inode)
-               return -ENOMEM;
+       inode = mqueue_get_inode(sb, ns, S_IFDIR | S_ISVTX | S_IRWXUGO,
+                               NULL);
+       if (!inode) {
+               error = -ENOMEM;
+               goto out;
+       }
  
         sb->s_root = d_alloc_root(inode);
         if (!sb->s_root) {
                 iput(inode);
-               return -ENOMEM;
+               error = -ENOMEM;
         }
  
-       return 0;
+out:
+       return error;
  }
  
  static int mqueue_get_sb(struct file_system_type *fs_type,
                          int flags, const char *dev_name,
                          void *data, struct vfsmount *mnt)
  {
-       return get_sb_single(fs_type, flags, data, mqueue_fill_super, mnt);
+       if (!(flags & MS_KERNMOUNT))
+               data = current->nsproxy->ipc_ns;
+       return get_sb_ns(fs_type, flags, data, mqueue_fill_super, mnt);
  }
  
  static void init_once(void *foo)
@@ -247,12 +257,13 @@ static void mqueue_delete_inode(struct inode *inode)
         struct user_struct *user;
         unsigned long mq_bytes;
         int i;
-       struct ipc_namespace *ipc_ns = &init_ipc_ns;
+       struct ipc_namespace *ipc_ns;
  
         if (S_ISDIR(inode->i_mode)) {
                 clear_inode(inode);
                 return;
         }
+       ipc_ns = get_ns_from_inode(inode);
         info = MQUEUE_I(inode);
         spin_lock(&info->lock);
         for (i = 0; i < info->attr.mq_curmsgs; i++)
@@ -268,10 +279,19 @@ static void mqueue_delete_inode(struct inode *inode)
         if (user) {
                 spin_lock(&mq_lock);
                 user->mq_bytes -= mq_bytes;
-               ipc_ns->mq_queues_count--;
+               /*
+                * get_ns_from_inode() ensures that the
+                * (ipc_ns = sb->s_fs_info) is either a valid ipc_ns
+                * to which we now hold a reference, or it is NULL.
+                * We can't put it here under mq_lock, though.
+                */
+               if (ipc_ns)
+                       ipc_ns->mq_queues_count--;
                 spin_unlock(&mq_lock);
                 free_uid(user);
         }
+       if (ipc_ns)
+               put_ipc_ns(ipc_ns);
  }
  
  static int mqueue_create(struct inode *dir, struct dentry *dentry,
@@ -280,9 +300,14 @@ static int mqueue_create(struct inode *dir, struct dentry *dentry,
         struct inode *inode;
         struct mq_attr *attr = dentry->d_fsdata;
         int error;
-       struct ipc_namespace *ipc_ns = &init_ipc_ns;
+       struct ipc_namespace *ipc_ns;
  
         spin_lock(&mq_lock);
+       ipc_ns = __get_ns_from_inode(dir);
+       if (!ipc_ns) {
+               error = -EACCES;
+               goto out_unlock;
+       }
         if (ipc_ns->mq_queues_count >= ipc_ns->mq_queues_max &&
                         !capable(CAP_SYS_RESOURCE)) {
                 error = -ENOSPC;
@@ -291,7 +316,7 @@ static int mqueue_create(struct inode *dir, struct dentry *dentry,
         ipc_ns->mq_queues_count++;
         spin_unlock(&mq_lock);
  
-       inode = mqueue_get_inode(dir->i_sb, mode, attr);
+       inode = mqueue_get_inode(dir->i_sb, ipc_ns, mode, attr);
         if (!inode) {
                 error = -ENOMEM;
                 spin_lock(&mq_lock);
@@ -299,6 +324,7 @@ static int mqueue_create(struct inode *dir, struct dentry *dentry,
                 goto out_unlock;
         }
  
+       put_ipc_ns(ipc_ns);
         dir->i_size += DIRENT_SIZE;
         dir->i_ctime = dir->i_mtime = dir->i_atime = CURRENT_TIME;
  
@@ -307,6 +333,8 @@ static int mqueue_create(struct inode *dir, struct dentry *dentry,
         return 0;
  out_unlock:
         spin_unlock(&mq_lock);
+       if (ipc_ns)
+               put_ipc_ns(ipc_ns);
         return error;
  }
  
@@ -668,7 +696,7 @@ SYSCALL_DEFINE4(mq_open, const char __user *, u_name, int, oflag, mode_t, mode,
         char *name;
         struct mq_attr attr;
         int fd, error;
-       struct ipc_namespace *ipc_ns = &init_ipc_ns;
+       struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns;
  
         if (u_attr && copy_from_user(&attr, u_attr, sizeof(struct mq_attr)))
                 return -EFAULT;
@@ -738,7 +766,7 @@ SYSCALL_DEFINE1(mq_unlink, const char __user *, u_name)
         char *name;
         struct dentry *dentry;
         struct inode *inode = NULL;
-       struct ipc_namespace *ipc_ns = &init_ipc_ns;
+       struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns;
  
         name = getname(u_name);
         if (IS_ERR(name))
@@ -1217,6 +1245,32 @@ static struct file_system_type mqueue_fs_type = {
         .kill_sb = kill_litter_super,
  };
  
+int mq_init_ns(struct ipc_namespace *ns)
+{
+       ns->mq_queues_count  = 0;
+       ns->mq_queues_max    = DFLT_QUEUESMAX;
+       ns->mq_msg_max       = DFLT_MSGMAX;
+       ns->mq_msgsize_max   = DFLT_MSGSIZEMAX;
+
+       ns->mq_mnt = kern_mount_data(&mqueue_fs_type, ns);
+       if (IS_ERR(ns->mq_mnt)) {
+               int err = PTR_ERR(ns->mq_mnt);
+               ns->mq_mnt = NULL;
+               return err;
+       }
+       return 0;
+}
+
+void mq_clear_sbinfo(struct ipc_namespace *ns)
+{
+       ns->mq_mnt->mnt_sb->s_fs_info = NULL;
+}
+
+void mq_put_mnt(struct ipc_namespace *ns)
+{
+       mntput(ns->mq_mnt);
+}
+
  static int msg_max_limit_min = MIN_MSGMAX;
  static int msg_max_limit_max = MAX_MSGMAX;
  
@@ -1288,15 +1342,14 @@ static int __init init_mqueue_fs(void)
         if (error)
                 goto out_sysctl;
  
-       init_ipc_ns.mq_mnt = kern_mount(&mqueue_fs_type);
+       spin_lock_init(&mq_lock);
+
+       init_ipc_ns.mq_mnt = kern_mount_data(&mqueue_fs_type, &init_ipc_ns);
         if (IS_ERR(init_ipc_ns.mq_mnt)) {
                 error = PTR_ERR(init_ipc_ns.mq_mnt);
                 goto out_filesystem;
         }
  
-       /* internal initialization - not common for vfs */
-       spin_lock_init(&mq_lock);
-
         return 0;
  
  out_filesystem:
diff --git a/ipc/msgutil.c b/ipc/msgutil.c

index 73c316cb8613a4dd8ad0c1bd0709eec95bfb6bbc..f095ee26883333fcc69d7c32e907a8f1949e2f3f 100644 (file)
--- a/ipc/msgutil.c
+++ b/ipc/msgutil.c
@@ -18,19 +18,16 @@
  
  #include "util.h"
  
+DEFINE_SPINLOCK(mq_lock);
+
  /*
   * The next 2 defines are here bc this is the only file
   * compiled when either CONFIG_SYSVIPC and CONFIG_POSIX_MQUEUE
   * and not CONFIG_IPC_NS.
   */
  struct ipc_namespace init_ipc_ns = {
-       .kref = {
-               /* It's not for this patch to change, but should this be 1? */
-               .refcount       = ATOMIC_INIT(2),
-       },
+       .count          = ATOMIC_INIT(1),
  #ifdef CONFIG_POSIX_MQUEUE
-       .mq_mnt          = NULL,
-       .mq_queues_count = 0,
         .mq_queues_max   = DFLT_QUEUESMAX,
         .mq_msg_max      = DFLT_MSGMAX,
         .mq_msgsize_max  = DFLT_MSGSIZEMAX,
diff --git a/ipc/namespace.c b/ipc/namespace.c

index 4b4dc6d847f18c2129a09d1dfe1e92e24053dfed..4a5e752a92766d6e578c5c94499bdd71e60cbaf5 100644 (file)
--- a/ipc/namespace.c
+++ b/ipc/namespace.c
@@ -9,23 +9,31 @@
  #include <linux/rcupdate.h>
  #include <linux/nsproxy.h>
  #include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/mount.h>
  
  #include "util.h"
  
  static struct ipc_namespace *clone_ipc_ns(struct ipc_namespace *old_ns)
  {
         struct ipc_namespace *ns;
+       int err;
  
         ns = kmalloc(sizeof(struct ipc_namespace), GFP_KERNEL);
         if (ns == NULL)
                 return ERR_PTR(-ENOMEM);
  
+       atomic_set(&ns->count, 1);
+       err = mq_init_ns(ns);
+       if (err) {
+               kfree(ns);
+               return ERR_PTR(err);
+       }
         atomic_inc(&nr_ipc_ns);
  
         sem_init_ns(ns);
         msg_init_ns(ns);
         shm_init_ns(ns);
-       mq_init_ns(ns);
  
         /*
          * msgmni has already been computed for the new ipc ns.
@@ -35,7 +43,6 @@ static struct ipc_namespace *clone_ipc_ns(struct ipc_namespace *old_ns)
         ipcns_notify(IPCNS_CREATED);
         register_ipcns_notifier(ns);
  
-       kref_init(&ns->kref);
         return ns;
  }
  
@@ -85,11 +92,34 @@ void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids,
         up_write(&ids->rw_mutex);
  }
  
-void free_ipc_ns(struct kref *kref)
+/*
+ * put_ipc_ns - drop a reference to an ipc namespace.
+ * @ns: the namespace to put
+ *
+ * If this is the last task in the namespace exiting, and
+ * it is dropping the refcount to 0, then it can race with
+ * a task in another ipc namespace but in a mounts namespace
+ * which has this ipcns's mqueuefs mounted, doing some action
+ * with one of the mqueuefs files.  That can raise the refcount.
+ * So dropping the refcount, and raising the refcount when
+ * accessing it through the VFS, are protected with mq_lock.
+ *
+ * (Clearly, a task raising the refcount on its own ipc_ns
+ * needn't take mq_lock since it can't race with the last task
+ * in the ipcns exiting).
+ */
+void put_ipc_ns(struct ipc_namespace *ns)
  {
-       struct ipc_namespace *ns;
+       if (atomic_dec_and_lock(&ns->count, &mq_lock)) {
+               mq_clear_sbinfo(ns);
+               spin_unlock(&mq_lock);
+               mq_put_mnt(ns);
+               free_ipc_ns(ns);
+       }
+}
  
-       ns = container_of(kref, struct ipc_namespace, kref);
+void free_ipc_ns(struct ipc_namespace *ns)
+{
         /*
          * Unregistering the hotplug notifier at the beginning guarantees
          * that the ipc namespace won't be freed while we are inside the
@@ -102,7 +132,6 @@ void free_ipc_ns(struct kref *kref)
         sem_exit_ns(ns);
         msg_exit_ns(ns);
         shm_exit_ns(ns);
-       mq_exit_ns(ns);
         kfree(ns);
         atomic_dec(&nr_ipc_ns);
  
diff --git a/ipc/util.h b/ipc/util.h

index 0e7d9223acc18ba75585b52a5ddb6f71ace95897..1187332a89d2e54143b953dc1b63af829b9ee592 100644 (file)
--- a/ipc/util.h
+++ b/ipc/util.h
@@ -21,9 +21,11 @@ void shm_init (void);
  struct ipc_namespace;
  
  #ifdef CONFIG_POSIX_MQUEUE
-void mq_exit_ns(struct ipc_namespace *ns);
+extern void mq_clear_sbinfo(struct ipc_namespace *ns);
+extern void mq_put_mnt(struct ipc_namespace *ns);
  #else
-static inline void mq_exit_ns(struct ipc_namespace *ns) { }
+static inline void mq_clear_sbinfo(struct ipc_namespace *ns) { }
+static inline void mq_put_mnt(struct ipc_namespace *ns) { }
  #endif
  
  #ifdef CONFIG_SYSVIPC
author	Serge E. Hallyn <serue@us.ibm.com>
	Tue, 7 Apr 2009 02:01:10 +0000 (19:01 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Tue, 7 Apr 2009 15:31:09 +0000 (08:31 -0700)
include/linux/ipc_namespace.h		patch | blob | history
ipc/mqueue.c		patch | blob | history
ipc/msgutil.c		patch | blob | history
ipc/namespace.c		patch | blob | history
ipc/util.h		patch | blob | history