return &conf->stripe_hashtbl[hash];
}
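+/*
+ * Each stripe is covered by one of NR_STRIPE_HASH_LOCKS hash locks, chosen
+ * from the low bits of its stripe number, so stripes with different hashes
+ * can be looked up and released in parallel instead of serializing on
+ * device_lock.
+ */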
+static inline int stripe_hash_locks_hash(sector_t sect)
+{
+ return (sect >> STRIPE_SHIFT) & STRIPE_HASH_LOCKS_MASK;
+}
+
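+/*
+ * A stripe's hash lock nests outside device_lock: take the hash lock first,
+ * then device_lock, and release them in the reverse order.
+ */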
+static inline void lock_device_hash_lock(struct r5conf *conf, int hash)
+{
+ spin_lock_irq(conf->hash_locks + hash);
+ spin_lock(&conf->device_lock);
+}
+
+static inline void unlock_device_hash_lock(struct r5conf *conf, int hash)
+{
+ spin_unlock(&conf->device_lock);
+ spin_unlock_irq(conf->hash_locks + hash);
+}
+
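+/*
+ * Slow path: take every hash lock in index order (hash_locks[0] serves as
+ * the lockdep nest-lock reference) and then device_lock, with interrupts
+ * disabled across the whole sequence.
+ */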
+static inline void lock_all_device_hash_locks_irq(struct r5conf *conf)
+{
+ int i;
+ local_irq_disable();
+ spin_lock(conf->hash_locks);
+ for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
+ spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks);
+ spin_lock(&conf->device_lock);
+}
+
+static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf)
+{
+ int i;
+ spin_unlock(&conf->device_lock);
+ for (i = NR_STRIPE_HASH_LOCKS; i; i--)
+ spin_unlock(conf->hash_locks + i - 1);
+ local_irq_enable();
+}
+
/* bio's attached to a stripe+device for I/O are linked together in bi_sector
* order without overlap. There may be several bio's per stripe+device, and
* a bio could span several devices.
}
}
-static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh)
+static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
+ struct list_head *temp_inactive_list)
{
BUG_ON(!list_empty(&sh->lru));
BUG_ON(atomic_read(&conf->active_stripes)==0);
< IO_THRESHOLD)
md_wakeup_thread(conf->mddev->thread);
atomic_dec(&conf->active_stripes);
- if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
- list_add_tail(&sh->lru, &conf->inactive_list);
- wake_up(&conf->wait_for_stripe);
- if (conf->retry_read_aligned)
- md_wakeup_thread(conf->mddev->thread);
- }
+ if (!test_bit(STRIPE_EXPANDING, &sh->state))
+ list_add_tail(&sh->lru, temp_inactive_list);
}
}
-static void __release_stripe(struct r5conf *conf, struct stripe_head *sh)
+static void __release_stripe(struct r5conf *conf, struct stripe_head *sh,
+ struct list_head *temp_inactive_list)
{
if (atomic_dec_and_test(&sh->count))
- do_release_stripe(conf, sh);
+ do_release_stripe(conf, sh, temp_inactive_list);
+}
+
+/*
+ * If @hash is NR_STRIPE_HASH_LOCKS, temp_inactive_list is an array with one
+ * list per hash lock; otherwise it is a single list for that hash.
+ *
+ * Be careful: only one task may add or delete stripes from a
+ * temp_inactive_list at any given time. Adding stripes only takes the device
+ * lock, while deleting stripes only takes the hash lock.
+ */
+static void release_inactive_stripe_list(struct r5conf *conf,
+ struct list_head *temp_inactive_list,
+ int hash)
+{
+ int size;
+ bool do_wakeup = false;
+ unsigned long flags;
+
+ if (hash == NR_STRIPE_HASH_LOCKS) {
+ size = NR_STRIPE_HASH_LOCKS;
+ hash = NR_STRIPE_HASH_LOCKS - 1;
+ } else
+ size = 1;
+ while (size) {
+ struct list_head *list = &temp_inactive_list[size - 1];
+
+ /*
+ * We don't hold any lock here yet, get_active_stripe() might
+ * remove stripes from the list
+ */
+ if (!list_empty_careful(list)) {
+ spin_lock_irqsave(conf->hash_locks + hash, flags);
+ list_splice_tail_init(list, conf->inactive_list + hash);
+ do_wakeup = true;
+ spin_unlock_irqrestore(conf->hash_locks + hash, flags);
+ }
+ size--;
+ hash--;
+ }
+
+ if (do_wakeup) {
+ wake_up(&conf->wait_for_stripe);
+ if (conf->retry_read_aligned)
+ md_wakeup_thread(conf->mddev->thread);
+ }
}
static struct llist_node *llist_reverse_order(struct llist_node *head)
}
/* should hold conf->device_lock already */
-static int release_stripe_list(struct r5conf *conf)
+static int release_stripe_list(struct r5conf *conf,
+ struct list_head *temp_inactive_list)
{
struct stripe_head *sh;
int count = 0;
head = llist_del_all(&conf->released_stripes);
head = llist_reverse_order(head);
while (head) {
+ int hash;
+
sh = llist_entry(head, struct stripe_head, release_list);
head = llist_next(head);
/* sh could be re-added after STRIPE_ON_RELEASE_LIST is cleared */
* again, the count is always > 1. This is true for
* STRIPE_ON_UNPLUG_LIST bit too.
*/
- __release_stripe(conf, sh);
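+ /* file the stripe on the temp list matching its hash lock index */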
+ hash = sh->hash_lock_index;
+ __release_stripe(conf, sh, &temp_inactive_list[hash]);
count++;
}
{
struct r5conf *conf = sh->raid_conf;
unsigned long flags;
+ struct list_head list;
+ int hash;
bool wakeup;
if (unlikely(!conf->mddev->thread) ||
local_irq_save(flags);
/* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */
if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) {
- do_release_stripe(conf, sh);
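+ /*
+ * Queue the stripe on a private list so that the hash lock is only
+ * taken after device_lock has been dropped.
+ */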
+ INIT_LIST_HEAD(&list);
+ hash = sh->hash_lock_index;
+ do_release_stripe(conf, sh, &list);
spin_unlock(&conf->device_lock);
+ release_inactive_stripe_list(conf, &list, hash);
}
local_irq_restore(flags);
}
/* find an idle stripe, make sure it is unhashed, and return it. */
-static struct stripe_head *get_free_stripe(struct r5conf *conf)
+static struct stripe_head *get_free_stripe(struct r5conf *conf, int hash)
{
struct stripe_head *sh = NULL;
struct list_head *first;
- if (list_empty(&conf->inactive_list))
+ if (list_empty(conf->inactive_list + hash))
goto out;
- first = conf->inactive_list.next;
+ first = (conf->inactive_list + hash)->next;
sh = list_entry(first, struct stripe_head, lru);
list_del_init(first);
remove_hash(sh);
atomic_inc(&conf->active_stripes);
+ BUG_ON(hash != sh->hash_lock_index);
out:
return sh;
}
static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
{
struct r5conf *conf = sh->raid_conf;
- int i;
+ int i, seq;
BUG_ON(atomic_read(&sh->count) != 0);
BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
(unsigned long long)sh->sector);
remove_hash(sh);
-
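+ /*
+ * conf->gen_lock is a seqcount: if a reshape changes the generation
+ * while this stripe is being initialised, start over.
+ */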
+retry:
+ seq = read_seqcount_begin(&conf->gen_lock);
sh->generation = conf->generation - previous;
sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks;
sh->sector = sector;
dev->flags = 0;
raid5_build_block(sh, i, previous);
}
+ if (read_seqcount_retry(&conf->gen_lock, seq))
+ goto retry;
insert_hash(conf, sh);
sh->cpu = smp_processor_id();
}
int previous, int noblock, int noquiesce)
{
struct stripe_head *sh;
+ int hash = stripe_hash_locks_hash(sector);
pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);
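+ /*
+ * The per-hash lock is enough for stripe lookup and for taking a stripe
+ * off the inactive list; device_lock is only taken briefly below when an
+ * already-active stripe's LRU state has to change.
+ */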
- spin_lock_irq(&conf->device_lock);
+ spin_lock_irq(conf->hash_locks + hash);
do {
wait_event_lock_irq(conf->wait_for_stripe,
conf->quiesce == 0 || noquiesce,
- conf->device_lock);
+ *(conf->hash_locks + hash));
sh = __find_stripe(conf, sector, conf->generation - previous);
if (!sh) {
if (!conf->inactive_blocked)
- sh = get_free_stripe(conf);
+ sh = get_free_stripe(conf, hash);
if (noblock && sh == NULL)
break;
if (!sh) {
conf->inactive_blocked = 1;
- wait_event_lock_irq(conf->wait_for_stripe,
- !list_empty(&conf->inactive_list) &&
- (atomic_read(&conf->active_stripes)
- < (conf->max_nr_stripes *3/4)
- || !conf->inactive_blocked),
- conf->device_lock);
+ wait_event_lock_irq(
+ conf->wait_for_stripe,
+ !list_empty(conf->inactive_list + hash) &&
+ (atomic_read(&conf->active_stripes)
+ < (conf->max_nr_stripes * 3 / 4)
+ || !conf->inactive_blocked),
+ *(conf->hash_locks + hash));
conf->inactive_blocked = 0;
} else
init_stripe(sh, sector, previous);
&& !test_bit(STRIPE_ON_UNPLUG_LIST, &sh->state)
&& !test_bit(STRIPE_ON_RELEASE_LIST, &sh->state));
} else {
+ spin_lock(&conf->device_lock);
if (!test_bit(STRIPE_HANDLE, &sh->state))
atomic_inc(&conf->active_stripes);
if (list_empty(&sh->lru) &&
+ !test_bit(STRIPE_ON_RELEASE_LIST, &sh->state) &&
!test_bit(STRIPE_EXPANDING, &sh->state))
BUG();
list_del_init(&sh->lru);
sh->group->stripes_cnt--;
sh->group = NULL;
}
+ spin_unlock(&conf->device_lock);
}
}
} while (sh == NULL);
if (sh)
atomic_inc(&sh->count);
- spin_unlock_irq(&conf->device_lock);
+ spin_unlock_irq(conf->hash_locks + hash);
return sh;
}
put_cpu();
}
-static int grow_one_stripe(struct r5conf *conf)
+static int grow_one_stripe(struct r5conf *conf, int hash)
{
struct stripe_head *sh;
sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL);
kmem_cache_free(conf->slab_cache, sh);
return 0;
}
+ sh->hash_lock_index = hash;
/* we just created an active stripe so... */
atomic_set(&sh->count, 1);
atomic_inc(&conf->active_stripes);
{
struct kmem_cache *sc;
int devs = max(conf->raid_disks, conf->previous_raid_disks);
+ int hash;
if (conf->mddev->gendisk)
sprintf(conf->cache_name[0],
return 1;
conf->slab_cache = sc;
conf->pool_size = devs;
- while (num--)
- if (!grow_one_stripe(conf))
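+ /* distribute stripes across the hash lists round-robin, resuming where
+ * the previous grow left off */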
+ hash = conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS;
+ while (num--) {
+ if (!grow_one_stripe(conf, hash))
return 1;
+ conf->max_nr_stripes++;
+ hash = (hash + 1) % NR_STRIPE_HASH_LOCKS;
+ }
return 0;
}
int err;
struct kmem_cache *sc;
int i;
+ int hash, cnt;
if (newsize <= conf->pool_size)
return 0; /* never bother to shrink */
* OK, we have enough stripes, start collecting inactive
* stripes and copying them over
*/
+ hash = 0;
+ cnt = 0;
list_for_each_entry(nsh, &newstripes, lru) {
- spin_lock_irq(&conf->device_lock);
- wait_event_lock_irq(conf->wait_for_stripe,
- !list_empty(&conf->inactive_list),
- conf->device_lock);
- osh = get_free_stripe(conf);
- spin_unlock_irq(&conf->device_lock);
+ lock_device_hash_lock(conf, hash);
+ wait_event_cmd(conf->wait_for_stripe,
+ !list_empty(conf->inactive_list + hash),
+ unlock_device_hash_lock(conf, hash),
+ lock_device_hash_lock(conf, hash));
+ osh = get_free_stripe(conf, hash);
+ unlock_device_hash_lock(conf, hash);
atomic_set(&nsh->count, 1);
for(i=0; i<conf->pool_size; i++)
nsh->dev[i].page = osh->dev[i].page;
for( ; i<newsize; i++)
nsh->dev[i].page = NULL;
+ nsh->hash_lock_index = hash;
kmem_cache_free(conf->slab_cache, osh);
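+ /*
+ * Advance to the next hash list once this one has received its share;
+ * the first (max_nr_stripes % NR_STRIPE_HASH_LOCKS) lists each hold one
+ * extra stripe.
+ */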
+ cnt++;
+ if (cnt >= conf->max_nr_stripes / NR_STRIPE_HASH_LOCKS +
+ !!((conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS) > hash)) {
+ hash++;
+ cnt = 0;
+ }
}
kmem_cache_destroy(conf->slab_cache);
return err;
}
-static int drop_one_stripe(struct r5conf *conf)
+static int drop_one_stripe(struct r5conf *conf, int hash)
{
struct stripe_head *sh;
- spin_lock_irq(&conf->device_lock);
- sh = get_free_stripe(conf);
- spin_unlock_irq(&conf->device_lock);
+ spin_lock_irq(conf->hash_locks + hash);
+ sh = get_free_stripe(conf, hash);
+ spin_unlock_irq(conf->hash_locks + hash);
if (!sh)
return 0;
BUG_ON(atomic_read(&sh->count));
static void shrink_stripes(struct r5conf *conf)
{
- while (drop_one_stripe(conf))
- ;
+ int hash;
+ for (hash = 0; hash < NR_STRIPE_HASH_LOCKS; hash++)
+ while (drop_one_stripe(conf, hash))
+ ;
if (conf->slab_cache)
kmem_cache_destroy(conf->slab_cache);
}
}
-static void activate_bit_delay(struct r5conf *conf)
+static void activate_bit_delay(struct r5conf *conf,
+ struct list_head *temp_inactive_list)
{
/* device_lock is held */
struct list_head head;
list_del_init(&conf->bitmap_list);
while (!list_empty(&head)) {
struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru);
+ int hash;
list_del_init(&sh->lru);
atomic_inc(&sh->count);
- __release_stripe(conf, sh);
+ hash = sh->hash_lock_index;
+ __release_stripe(conf, sh, &temp_inactive_list[hash]);
}
}
return 1;
if (conf->quiesce)
return 1;
- if (list_empty_careful(&conf->inactive_list))
+ if (atomic_read(&conf->active_stripes) == conf->max_nr_stripes)
return 1;
return 0;
struct raid5_plug_cb {
struct blk_plug_cb cb;
struct list_head list;
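+ /* per-hash staging lists, flushed by raid5_unplug() */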
+ struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS];
};
static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
struct mddev *mddev = cb->cb.data;
struct r5conf *conf = mddev->private;
int cnt = 0;
+ int hash;
if (cb->list.next && !list_empty(&cb->list)) {
spin_lock_irq(&conf->device_lock);
* STRIPE_ON_RELEASE_LIST could be set here. In that
* case, the count is always > 1.
*/
- __release_stripe(conf, sh);
+ hash = sh->hash_lock_index;
+ __release_stripe(conf, sh, &cb->temp_inactive_list[hash]);
cnt++;
}
spin_unlock_irq(&conf->device_lock);
}
+ release_inactive_stripe_list(conf, cb->temp_inactive_list,
+ NR_STRIPE_HASH_LOCKS);
if (mddev->queue)
trace_block_unplug(mddev->queue, cnt, !from_schedule);
kfree(cb);
cb = container_of(blk_cb, struct raid5_plug_cb, cb);
- if (cb->list.next == NULL)
+ if (cb->list.next == NULL) {
+ int i;
INIT_LIST_HEAD(&cb->list);
+ for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
+ INIT_LIST_HEAD(cb->temp_inactive_list + i);
+ }
if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state))
list_add_tail(&sh->lru, &cb->list);
}
static int handle_active_stripes(struct r5conf *conf, int group,
- struct r5worker *worker)
+ struct r5worker *worker,
+ struct list_head *temp_inactive_list)
{
struct stripe_head *batch[MAX_STRIPE_BATCH], *sh;
- int i, batch_size = 0;
+ int i, batch_size = 0, hash;
+ bool release_inactive = false;
while (batch_size < MAX_STRIPE_BATCH &&
(sh = __get_priority_stripe(conf, group)) != NULL)
batch[batch_size++] = sh;
- if (batch_size == 0)
- return batch_size;
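+ /*
+ * No stripes are ready: if earlier batches parked stripes on the temp
+ * inactive lists, drop device_lock once to flush them before giving up.
+ */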
+ if (batch_size == 0) {
+ for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
+ if (!list_empty(temp_inactive_list + i))
+ break;
+ if (i == NR_STRIPE_HASH_LOCKS)
+ return batch_size;
+ release_inactive = true;
+ }
spin_unlock_irq(&conf->device_lock);
+ release_inactive_stripe_list(conf, temp_inactive_list,
+ NR_STRIPE_HASH_LOCKS);
+
+ if (release_inactive) {
+ spin_lock_irq(&conf->device_lock);
+ return 0;
+ }
+
for (i = 0; i < batch_size; i++)
handle_stripe(batch[i]);
cond_resched();
spin_lock_irq(&conf->device_lock);
- for (i = 0; i < batch_size; i++)
- __release_stripe(conf, batch[i]);
+ for (i = 0; i < batch_size; i++) {
+ hash = batch[i]->hash_lock_index;
+ __release_stripe(conf, batch[i], &temp_inactive_list[hash]);
+ }
return batch_size;
}
while (1) {
int batch_size, released;
- released = release_stripe_list(conf);
+ released = release_stripe_list(conf, worker->temp_inactive_list);
- batch_size = handle_active_stripes(conf, group_id, worker);
+ batch_size = handle_active_stripes(conf, group_id, worker,
+ worker->temp_inactive_list);
worker->working = false;
if (!batch_size && !released)
break;
struct bio *bio;
int batch_size, released;
- released = release_stripe_list(conf);
+ released = release_stripe_list(conf, conf->temp_inactive_list);
if (
!list_empty(&conf->bitmap_list)) {
bitmap_unplug(mddev->bitmap);
spin_lock_irq(&conf->device_lock);
conf->seq_write = conf->seq_flush;
- activate_bit_delay(conf);
+ activate_bit_delay(conf, conf->temp_inactive_list);
}
raid5_activate_delayed(conf);
handled++;
}
- batch_size = handle_active_stripes(conf, ANY_GROUP, NULL);
+ batch_size = handle_active_stripes(conf, ANY_GROUP, NULL,
+ conf->temp_inactive_list);
if (!batch_size && !released)
break;
handled += batch_size;
{
struct r5conf *conf = mddev->private;
int err;
+ int hash;
if (size <= 16 || size > 32768)
return -EINVAL;
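+ /*
+ * Stripes are grown round-robin starting from hash 0, so shrink in the
+ * reverse order to keep the per-hash inactive lists balanced.
+ */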
+ hash = (conf->max_nr_stripes - 1) % NR_STRIPE_HASH_LOCKS;
while (size < conf->max_nr_stripes) {
- if (drop_one_stripe(conf))
+ if (drop_one_stripe(conf, hash))
conf->max_nr_stripes--;
else
break;
+ hash--;
+ if (hash < 0)
+ hash = NR_STRIPE_HASH_LOCKS - 1;
}
err = md_allow_write(mddev);
if (err)
return err;
+ hash = conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS;
while (size > conf->max_nr_stripes) {
- if (grow_one_stripe(conf))
+ if (grow_one_stripe(conf, hash))
conf->max_nr_stripes++;
else break;
+ hash = (hash + 1) % NR_STRIPE_HASH_LOCKS;
}
return 0;
}
static int alloc_thread_groups(struct r5conf *conf, int cnt)
{
- int i, j;
+ int i, j, k;
ssize_t size;
struct r5worker *workers;
group->workers = workers + i * cnt;
for (j = 0; j < cnt; j++) {
- group->workers[j].group = group;
- INIT_WORK(&group->workers[j].work, raid5_do_work);
+ struct r5worker *worker = group->workers + j;
+ worker->group = group;
+ INIT_WORK(&worker->work, raid5_do_work);
+
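+ /* per-worker staging lists, one per hash lock */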
+ for (k = 0; k < NR_STRIPE_HASH_LOCKS; k++)
+ INIT_LIST_HEAD(worker->temp_inactive_list + k);
}
}
struct md_rdev *rdev;
struct disk_info *disk;
char pers_name[6];
+ int i;
if (mddev->new_level != 5
&& mddev->new_level != 4
INIT_LIST_HEAD(&conf->hold_list);
INIT_LIST_HEAD(&conf->delayed_list);
INIT_LIST_HEAD(&conf->bitmap_list);
- INIT_LIST_HEAD(&conf->inactive_list);
init_llist_head(&conf->released_stripes);
atomic_set(&conf->active_stripes, 0);
atomic_set(&conf->preread_active_stripes, 0);
if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
goto abort;
+ /* We init hash_locks[0] separately so that it can be used
+ * as the reference lock in the spin_lock_nest_lock() call
+ * in lock_all_device_hash_locks_irq in order to convince
+ * lockdep that we know what we are doing.
+ */
+ spin_lock_init(conf->hash_locks);
+ for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
+ spin_lock_init(conf->hash_locks + i);
+
+ for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
+ INIT_LIST_HEAD(conf->inactive_list + i);
+
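+ /* staging lists used by raid5d itself, one per hash lock */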
+ for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
+ INIT_LIST_HEAD(conf->temp_inactive_list + i);
+
conf->level = mddev->new_level;
if (raid5_alloc_percpu(conf) != 0)
goto abort;
else
conf->max_degraded = 1;
conf->algorithm = mddev->new_layout;
- conf->max_nr_stripes = NR_STRIPES;
conf->reshape_progress = mddev->reshape_position;
if (conf->reshape_progress != MaxSector) {
conf->prev_chunk_sectors = mddev->chunk_sectors;
- memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
+ memory = NR_STRIPES * (sizeof(struct stripe_head) +
max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
- if (grow_stripes(conf, conf->max_nr_stripes)) {
+ if (grow_stripes(conf, NR_STRIPES)) {
printk(KERN_ERR
"md/raid:%s: couldn't allocate %dkB for buffers\n",
mdname(mddev), memory);
break;
case 1: /* stop all writes */
- spin_lock_irq(&conf->device_lock);
+ lock_all_device_hash_locks_irq(conf);
/* '2' tells resync/reshape to pause so that all
* active stripes can drain
*/
conf->quiesce = 2;
- wait_event_lock_irq(conf->wait_for_stripe,
+ wait_event_cmd(conf->wait_for_stripe,
atomic_read(&conf->active_stripes) == 0 &&
atomic_read(&conf->active_aligned_reads) == 0,
- conf->device_lock);
+ unlock_all_device_hash_locks_irq(conf),
+ lock_all_device_hash_locks_irq(conf));
conf->quiesce = 1;
- spin_unlock_irq(&conf->device_lock);
+ unlock_all_device_hash_locks_irq(conf);
/* allow reshape to continue */
wake_up(&conf->wait_for_overlap);
break;
case 0: /* re-enable writes */
- spin_lock_irq(&conf->device_lock);
+ lock_all_device_hash_locks_irq(conf);
conf->quiesce = 0;
wake_up(&conf->wait_for_stripe);
wake_up(&conf->wait_for_overlap);
- spin_unlock_irq(&conf->device_lock);
+ unlock_all_device_hash_locks_irq(conf);
break;
}
}