0001: 
0002: /*
0003:  * Copyright (C) 2011 Tudor Marian <tudorm@cs.cornell.edu> (see
0004:  * LICENSE file)
0005:  *
0006:  *
0007:  * Gecko Log Structured Storage System (LSSS, or LS3)
0008:  *
0009:  * The implementation consists of a device-mapper kernel module.
0010:  *
0011:  *
 * The Gecko LS3 overlays a log abstraction on top of a multi-disk RAID
 * array.  All writes are appends to the log.  Reads are served from
 * their locations on disk, which may be anywhere in the log.  To save
 * power, the redundant disks may be turned off, trading read
 * throughput for power savings.  For example, in a RAID-1
 * configuration, all `mirror' disks (except the disk that holds the
 * log head, hence the one being actively written) can be turned
 * off---reads are then served from the `primary' disks.  The scheme
 * generalizes to RAID-5 and RAID-6.  Furthermore, RAID-5 can be
 * replaced by RAID-4, whose parity disk(s) are no longer a bottleneck
 * thanks to the append-only nature of the writes hitting the log.
0024:  *
 * Since this is a prototype, it does not yet implement live RAID
 * recovery.  Ideally, Gecko would be a simple linear layer on top of
 * the conventional RAID implementations; however, additional
 * functionality must be exposed between the layers to maintain
 * efficiency, e.g. the Gecko LS3 layer must be able to put the disks
 * of the underlying array into various power modes and to direct
 * reads to the powered-on mirror disks.
0032:  */
0033: 
0034: #include <asm/atomic.h>
0035: #include <asm/uaccess.h>
0036: #include <asm/unistd.h>
0037: #include <linux/bio.h>
0038: #include <linux/blkdev.h>
0039: #include <linux/device-mapper.h>
0040: #include <linux/dm-io.h>
0041: #include <linux/fcntl.h>
0042: #include <linux/file.h>
0043: #include <linux/fs.h>
0044: #include <linux/hash.h>
0045: #include <linux/highmem.h>
0046: #include <linux/hrtimer.h>
0047: #include <linux/init.h>
0048: #include <linux/kernel.h>
0049: #include <linux/kthread.h>
0050: #include <linux/list.h>
0051: #include <linux/mempool.h>
0052: #include <linux/module.h>
0053: #include <linux/moduleparam.h>
0054: #include <linux/pagemap.h>
0055: #include <linux/percpu.h>
0056: #include <linux/random.h>
0057: #include <linux/rwsem.h>
0058: #include <linux/slab.h>
0059: #include <linux/spinlock.h>
0060: #include <linux/syscalls.h>
0061: #include <linux/version.h>
0062: #include <linux/workqueue.h>
0063: #include <scsi/sg.h>
0064: 
0065: #include "dmg-kcopyd.h"
0066: 
0067: //#define ALWAYS_RUN_GC
0068: #define DROP_WRITE_WRITE_CLASH_OPTIMIZATION
0069: //#define SYS_RENAME_EXPORTED_TO_MODULES
0070: 
0071: //#define MAX_DETAIL_LOG_LOOP_CNT 0xffffffff
0072: #define MAX_DETAIL_LOG_LOOP_CNT 8
0073: 
0074: #define MIN_JOBS_IN_POOL        512
0075: #define DM_GECKO_GC_COPY_PAGES  512
0076: #define DM_GECKO_MAX_STRIPES    DMG_KCOPYD_MAX_REGIONS
0077: #define MIN_GC_CONCURRENT_REQ   4
0078: #define GC_CONCURRENT_REQ       64
0079: #define MAX_GC_CONCURRENT_REQ   DM_GECKO_GC_COPY_PAGES
0080: 
0081: #define GC_DEFAULT_LOW_WATERMARK         0
0082: #define GC_DEFAULT_HIGH_WATERMARK        3
0083: #define DM_GECKO_CRITICAL_WATERMARK      1024
0084: #define DM_GECKO_CRITICAL_WATERMARK_HARD 8
0085: 
0086: #define DM_GECKO_DEBUG 0
0087: #define DM_GECKO_PREFIX "dm-gecko: "
0088: #if DM_GECKO_DEBUG
0089: #define DPRINTK( s, arg... ) printk(DM_GECKO_PREFIX s "\n", ##arg)
0090: #else
0091: #define DPRINTK( s, arg... )
0092: #endif
0093: 
0094: #define GECKO_TIMER_PERIOD_SECS         (1)
0095: #define GECKO_TIMER_PERIOD_NSECS        (0)
0096: 
#define GECKO_BLOCK_SHIFT PAGE_SHIFT        /* blocks are page sized */
0098: #define GECKO_BLOCK_SIZE (1UL << GECKO_BLOCK_SHIFT)
0099: 
0100: #define GECKO_SECTOR_TO_BLOCK_SHIFT (GECKO_BLOCK_SHIFT - SECTOR_SHIFT)
0101: #define GECKO_SECTOR_TO_BLOCK_MASK ((1UL << GECKO_SECTOR_TO_BLOCK_SHIFT) - 1)
0102: #define GECKO_SECTORS_PER_BLOCK (1UL << GECKO_SECTOR_TO_BLOCK_SHIFT)
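/*
 * Worked example (assuming 4 KiB pages and 512-byte sectors, i.e.
 * PAGE_SHIFT == 12 and SECTOR_SHIFT == 9): GECKO_SECTOR_TO_BLOCK_SHIFT
 * is 3, GECKO_SECTORS_PER_BLOCK is 8 and GECKO_SECTOR_TO_BLOCK_MASK is
 * 0x7.  Sector 4096 maps to block 512 (4096 >> 3) and block 512 back
 * to sector 4096; sector 4099 is not block aligned (4099 & 0x7 == 3).
 */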
0103: 
0104: static inline sector_t sector_to_block(sector_t sector)
0105: {
0106:         return (sector >> GECKO_SECTOR_TO_BLOCK_SHIFT);
0107: }
0108: 
static inline sector_t block_to_sector(sector_t block)
{
        return (block << GECKO_SECTOR_TO_BLOCK_SHIFT);
}
0113: 
0114: static inline int sector_at_block_boundary(sector_t sector)
0115: {
0116:         return ((sector & GECKO_SECTOR_TO_BLOCK_MASK) == 0x0);
0117: }
0118: 
0119: static inline int bio_start_at_block_boundary(struct bio *bio)
0120: {
0121:         return sector_at_block_boundary(bio->bi_sector);
0122: }
0123: 
0124: static inline int bio_end_at_block_boundary(struct bio *bio)
0125: {
0126:         return sector_at_block_boundary(bio->bi_sector +
0127:                                         to_sector(bio->bi_size));
0128: }
0129: 
0130: static inline int bio_at_block_boundary(struct bio *bio)
0131: {
0132:         return bio_start_at_block_boundary(bio)
0133:             && bio_end_at_block_boundary(bio);
0134: }
0135: 
0136: static inline int bio_single_block_at_block_boundary(struct bio *bio)
0137: {
0138:         return (bio->bi_size == GECKO_BLOCK_SIZE)
0139:             && bio_at_block_boundary(bio);
0140: }
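/*
 * Example (again assuming 4 KiB blocks): a bio with bi_sector == 8 and
 * bi_size == 4096 covers exactly one block (block 1) and satisfies
 * bio_single_block_at_block_boundary(); a bio starting at sector 10,
 * or an 8 KiB bio, does not.
 */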
0141: 
0142: enum seg_power_state {
0143:         unspecified,
0144:         active,      /* normal active/idle operation mode */
0145:         standby,     /* low power mode, drive has spun down */
0146:         sleep,       /* lowest power mode, drive is completely shut down */
0147: };
0148: 
0149: #define DEFAULT_LOW_POW_STATE standby
0150: 
0151: struct dm_gecko;
0152: 
0153: struct dm_dev_seg {
0154:         struct list_head list;
0155:         int idx;          /* only used for debugging purposes */
0156:         sector_t start;   /* offset in sectors */
0157:         sector_t len;     /* len in sectors */
0158:         struct dm_dev *dev[DM_GECKO_MAX_STRIPES];
0159:         struct work_struct work;
0160:         enum seg_power_state cur_pow_state, next_pow_state;
0161:         unsigned long long access_seq_in_log;
0162:         atomic_t pending_writes;
0163:         struct dm_gecko *ctxt;
0164: };
0165: 
0166: enum dm_gecko_layout { linear, raid0, raid1, raid5, raid6 };
0167: 
0168: struct phy_disk_map {
0169:         sector_t len;                /* total linear length in sectors */
0170:         enum dm_gecko_layout layout;
0171:         int stripes;
0172:         int cnt;
0173:         struct list_head dm_dev_segs;
0174: };
0175: 
0176: /* Hashtable for pending IO operations indexed by block number */
0177: #define HASH_TABLE_BITS 12
0178: #define HASH_TABLE_SIZE (1UL << HASH_TABLE_BITS)
0179: 
0180: struct dm_gecko_stats {
0181:         unsigned long long reads, subblock_reads, writes, subblock_writes,
0182:             gc, discards, dropped_discards, empty_barriers, gc_recycle,
0183:             rw_clash, rw_gc_clash, gc_clash, gc_rw_clash, ww_clash,
0184:             read_empty, read_err, write_err, kcopyd_err, sb_read, sb_write;
0185: };
0186: 
0187: struct gc_ctrl {
0188:         u32 low_watermark;
0189:         u32 high_watermark;
0190: };
0191: 
enum {  // dm_gecko->flags bit positions
        DM_GECKO_GC_FORCE_STOP,        // gc is forcibly disabled
        DM_GECKO_FINAL_SYNC_METADATA,  // final metadata sync at target destroy
        DM_GECKO_GC_STARTED,           // gc watermark hysteresis: gc is running
        DM_GECKO_READ_TPUT,            // favor read throughput over power savings
        DM_GECKO_INDEPENDENT_GC,       // gc reads from its own (tail segment) stripe
        DM_GECKO_STATUS_DETAILED,      // report detailed status
        DM_GECKO_SYNCING_METADATA,     // a metadata-sync operation is in progress
};
0201: 
0202: struct dm_gecko {
0203:         spinlock_t lock;
0204:         atomic_t total_jobs;        /* used to safely destroy the target */
0205:         struct list_head *buckets;
0206:         int htable_size;
        u32 *d_map;              /* direct map: virtual block -> linear (log) block */
        u32 *r_map;              /* reverse map: linear (log) block -> virtual block */
        u32 tail;                /* tail of the circular log; the gc reclaims from here */
        u32 persistent_tail;     /* tail as of the last metadata sync */
        u32 head;                /* log head: next linear block claimed for writing */
        u32 size;                /* size of the maps in number of blocks */
        /* free blocks that can be used w/o reclaiming; initialized to
           ->size.  The circular-ring logic dictates that available_blocks
           must be > 1 */
        u32 persistent_available_blocks;
        u32 available_blocks;
        u32 free_blocks;        /* total number of free blocks */
0219:         struct dm_dev_seg *head_seg;
0220:         struct dm_dev_seg *tail_seg;
0221:         volatile unsigned long flags;
0222:         int gc_req_in_progress;
0223:         int max_gc_req_in_progress;
0224:         struct phy_disk_map disk_map;
0225:         struct dmg_kcopyd_client *kcopyd_client;
0226:         struct dm_io_client *io_client;
0227:         struct dm_gecko_stats *stats;
0228:         struct gc_ctrl gc_ctrl;
0229:         struct hrtimer timer;
0230:         ktime_t timer_delay;
0231:         struct work_struct gc_work;
0232:         atomic_t timer_active;
0233:         atomic_t gc_work_scheduled_by_timer;
0234:         struct work_struct sync_metadata_work;
0235:         wait_queue_head_t jobs_pending_waitqueue;
0236:         wait_queue_head_t no_free_space_waitqueue;
0237:         struct rw_semaphore metadata_sync_sema;
0238:         enum seg_power_state low_pow_state;
0239:         unsigned long long incarnation;
0240:         unsigned long tail_wrap_around;
0241:         unsigned long head_wrap_around;
0242:         char *meta_filename;
0243: };
0244: 
0245: struct dm_gecko_dev {
0246:         char name[16];  // for the persistent metadata representation
0247: };
0248: 
0249: /* TUDORICA, the diminutive form of my name (Tudor) in Romanian ;) */
0250: #define DM_GECKO_META_MAGIC (0x2D0031CA)
0251: 
0252: /* the persistent dm_gecko starts w/ this header on disk, followed by
0253:    all the disk_map_cnt device names of (struct dm_gecko_dev) type,
0254:    and by block maps */
0255: struct dm_gecko_persistent_metadata {
0256:         unsigned long long incarnation;
0257:         u32 magic;
0258:         u32 size;
0259:         u32 tail;
0260:         u32 head;
0261:         u32 available_blocks;
0262:         u32 free_blocks;
0263:         unsigned long flags;
0264:         int max_gc_req_in_progress;
0265:         enum dm_gecko_layout layout;
0266:         int stripes;
0267:         int disk_map_cnt;
0268:         struct gc_ctrl gc_ctrl;
0269:         enum seg_power_state low_pow_state;
0270: };
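/*
 * Illustrative on-disk layout implied by the comment above (a sketch;
 * the authoritative serialization lives in the metadata store/load
 * code, see store_dm_gecko):
 *
 *   struct dm_gecko_persistent_metadata     header (magic, sizes, flags, ...)
 *   struct dm_gecko_dev [disk_map_cnt]      device names
 *   u32 [size] block maps                   d_map and r_map
 */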
0271: 
0272: struct io_for_block {
0273:         sector_t key;
0274:         int rw_cnt;                     /* number of IOs in progress. If
0275:                                          * negative, the gc is running */
0276: #define WRITE_CLASH_IO_FOR_BLOCK 0
0277:         volatile unsigned long flags;   /* for write_clash optimization */
0278:         struct list_head hashtable;     /* chain into the hash table */
0279:         struct list_head pending_io;    /* list of IOs in progress */
0280:         struct list_head deferred_io;   /* list of deferred IOs */
0281: };
0282: 
0283: struct io_job {
0284:         struct list_head list;
0285:         struct dm_gecko *dmg;
0286:         int rw;                        /* READ or WRITE */
0287:         struct io_for_block *parent;   /* if NULL, the job is deferred */
0288:         void *page;                    /* for read-modify-update cycles */
0289:         struct bio *bio;               /* if NULL this is a gc IO */
0290:         sector_t v_block;              /* virtual block */
0291:         sector_t l_block;              /* linear block */
0292: };
0293: 
0294: static inline int sector_in_seg(sector_t sector, struct dm_dev_seg *seg)
0295: {
0296:         return (sector >= seg->start) && (sector < seg->start + seg->len);
0297: }
0298: 
0299: static struct dm_dev_seg *seg_for_sector(struct dm_gecko *dmg,
0300:                                                 sector_t sector)
0301: {
0302:         struct dm_dev_seg *seg;
0303:         list_for_each_entry(seg, &dmg->disk_map.dm_dev_segs, list) {
0304:                 if (sector < seg->start + seg->len) {
0305:                         return seg;
0306:                 }
0307:         }
0308:         return NULL;
0309: }
0310: 
0311: static inline void linear_to_phy_raid0(struct dm_gecko *dmg,
0312:                                        struct dm_dev_seg *seg,
0313:                                        sector_t sector,
0314:                                        struct dm_io_region *where)
0315: {
0316:         int stripe;
0317:         sector_t block;
0318:         sector -= seg->start;
0319:         block = sector_to_block(sector);
        // do_div() updates @block in place with the quotient and
        // returns the remainder.  Stripe on the block number, not the
        // sector: block-aligned sector numbers are all multiples of
        // GECKO_SECTORS_PER_BLOCK, so e.g. with two stripes the sector
        // would always map to stripe 0.
0324:         stripe = do_div(block, dmg->disk_map.stripes);
0325:         where->bdev = seg->dev[stripe]->bdev;
0326:         where->sector = block_to_sector(block);
0327:         where->count = GECKO_SECTORS_PER_BLOCK;
0328: }
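/*
 * Example with hypothetical numbers: for stripes == 2 and a segment
 * starting at sector 0, linear sector 56 is block 7; do_div(7, 2)
 * leaves block == 3 and returns stripe 1, so the IO is directed to
 * seg->dev[1] at sector 24 (block 3) for GECKO_SECTORS_PER_BLOCK
 * sectors.
 */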
0329: 
0330: static struct dm_dev_seg *linear_to_phy_all(struct dm_gecko *dmg,
0331:                                             sector_t sector,
0332:                                             struct dm_io_region *where,
0333:                                             int *num_regions)
0334: {
0335:         int i;
0336:         struct dm_dev_seg *seg = seg_for_sector(dmg, sector);
0337: 
0338:         BUG_ON(!seg);  /* must fit in the range somewhere */
0339: 
0340:         if (dmg->disk_map.layout == raid0) {
0341:                 linear_to_phy_raid0(dmg, seg, sector, &where[0]);
0342:                 *num_regions = 1;
0343:         } else {
0344:                 for (i = 0; i < dmg->disk_map.stripes; i++) {
0345:                         where[i].bdev = seg->dev[i]->bdev;
0346:                         where[i].sector = sector - seg->start;
0347:                         where[i].count = GECKO_SECTORS_PER_BLOCK;
0348:                 }
0349:                 *num_regions = dmg->disk_map.stripes;
0350:         }
0351:         return seg;
0352: }
0353: 
0354: static struct dm_dev_seg *linear_to_phy_which(struct dm_gecko *dmg,
0355:                                               sector_t sector,
0356:                                               unsigned which,
0357:                                               struct dm_io_region *where)
0358: {
0359:         struct dm_dev_seg *seg = seg_for_sector(dmg, sector);
0360: 
0361:         BUG_ON(!seg);  /* must fit in the range somewhere */
0362:         BUG_ON(which >= dmg->disk_map.stripes);
0363: 
0364:         if (dmg->disk_map.layout == raid0) {
0365:                 linear_to_phy_raid0(dmg, seg, sector, where);
0366:         } else {
0367:                 where->bdev = seg->dev[which]->bdev;
0368:                 where->sector = sector - seg->start;
0369:                 where->count = GECKO_SECTORS_PER_BLOCK;
0370:         }
0371:         return seg;
0372: }
0373: 
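/* A map entry equal to ->size (one past the last valid block index) is
 * the sentinel meaning `free'; any value strictly greater than ->size
 * is invalid. */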
0374: static inline u32 mark_block_free(struct dm_gecko *dmg)
0375: {
0376:         return dmg->size;
0377: }
0378: 
0379: static inline int is_block_marked_free(u32 block, struct dm_gecko *dmg)
0380: {
0381:         return (block == dmg->size);
0382: }
0383: 
0384: static inline int is_block_invalid(u32 block, struct dm_gecko *dmg)
0385: {
0386:         return (block > dmg->size);
0387: }
0388: 
0389: static inline int is_block_free_or_invalid(u32 block, struct dm_gecko *dmg)
0390: {
0391:         return (block >= dmg->size);
0392: }
0393: 
0394: static inline int __no_available_blocks(struct dm_gecko *dmg)
0395: {
0396:         /* can be less than the watermark temporarily while gc runs */
0397:         return (dmg->available_blocks <= DM_GECKO_CRITICAL_WATERMARK);
0398: }
0399: 
0400: static inline int __no_available_blocks_hard(struct dm_gecko *dmg)
0401: {
0402:         return (dmg->available_blocks <= DM_GECKO_CRITICAL_WATERMARK_HARD);
0403: }
0404: 
0405: /* used by all dm-gecko targets */
0406: static DEFINE_SPINLOCK(jobs_lock);
0407: /* the workqueue picks up items off this list */
0408: static LIST_HEAD(deferred_jobs);
0409: 
0410: /* mempool cache allocators */
0411: static struct kmem_cache *io_for_block_cache, *io_job_cache;
0412: static mempool_t *io_for_block_mempool, *io_job_mempool;
0413: 
0414: /* Deferred work and work that needs a task context executes on this
0415:  * workqueue. Must be singlethreaded. */
0416: static struct workqueue_struct *gecko_wqueue = NULL;
0417: static struct work_struct gecko_work;
/* This workqueue is used only to sync the metadata from a task context.
 * Using the same gecko_wqueue for that operation would make the
 * deadlock-avoidance logic unnecessarily complicated. */
0421: static struct workqueue_struct *gecko_sync_metadata_wqueue = NULL;
0422: 
0423: struct deferred_stats {
0424:         unsigned long long gc, rw, total;
0425: };
0426: DEFINE_PER_CPU(struct deferred_stats, deferred_stats);
0427: 
0428: static  void do_complete_generic(struct dm_gecko *dmg)
0429: {
0430:         if (atomic_dec_and_test(&dmg->total_jobs)) {
0431:                 wake_up(&dmg->jobs_pending_waitqueue);
0432:         }
0433: }
0434: 
0435: static void do_run_gc(struct io_job *io);
0436: static void map_rw_io_job(struct io_job *io);
0437: 
0438: static inline void wake_deferred_wqueue(void)
0439: {
0440:         queue_work(gecko_wqueue, &gecko_work);
0441: }
0442: 
0443: static inline int io_job_is_deferred(struct io_job *io)
0444: {
0445:         return (io->parent == NULL);
0446: }
0447: 
0448: static inline void set_io_job_deferred(struct io_job *io)
0449: {
0450:         io->parent = NULL;
0451: }
0452: 
0453: static inline int io_job_is_gc(struct io_job *io)
0454: {
0455:         return (io->bio == NULL);
0456: }
0457: 
0458: static inline void set_io_job_gc(struct io_job *io)
0459: {
0460:         io->bio = NULL;
0461: }
0462: 
0463: static inline void __add_deferred_io_job(struct io_job *io)
0464: {
0465:         set_io_job_deferred(io);
0466:         list_add_tail(&io->list, &deferred_jobs);
0467: }
0468: 
0469: static void queue_deferred_io_job(struct io_job *io)
0470: {
0471:         unsigned long flags;
0472:         struct deferred_stats *def_stats;
0473: 
0474:         spin_lock_irqsave(&jobs_lock, flags);
0475:         def_stats = &__get_cpu_var(deferred_stats);
0476:         __add_deferred_io_job(io);
0477:         ++def_stats->total;
0478:         spin_unlock_irqrestore(&jobs_lock, flags);
0479: 
0480:         wake_deferred_wqueue();
0481: }
0482: 
0483: /* The only entry point into the gc; can be called from interrupt context */
0484: static void wake_gc(struct io_job *io)
0485: {
0486:         set_io_job_gc(io);
0487:         io->page = NULL;
0488:         queue_deferred_io_job(io);
0489: }
0490: 
0491: /* Runs on the global workqueue, serialized w/ the IO completion
0492:  * work_structs since the workqueue is singlethreaded. */
0493: static void try_sched_gc(struct work_struct *work)
0494: {
0495:         struct dm_gecko *dmg = container_of(work, struct dm_gecko, gc_work);
0496:         struct io_job *io;
0497:         int i;
0498: 
0499:         // Optimistic estimate of the # of gc requests that can be
0500:         // issued --- read the dmg->gc_req_in_progress without holding
0501:         // the dmg->lock.
0502:         int gc_requests = (dmg->max_gc_req_in_progress -
0503:                            dmg->gc_req_in_progress);
0504:         if (gc_requests < 1) {
0505:           gc_requests = 1;
0506:         }
0507: 
0508:         for (i = 0; i < gc_requests; ++i) {
0509:           atomic_inc(&dmg->total_jobs);
0510:           io = mempool_alloc(io_job_mempool, GFP_NOIO);
0511:           io->dmg = dmg;
0512:           wake_gc(io);
0513:         }
0514:         atomic_set(&dmg->gc_work_scheduled_by_timer, 0);
0515: }
0516: 
/* This executes in hrtimer (irq) context, hence it can't mempool_alloc
 * w/ the GFP_NOIO flag; it merely queues gc_work and lets try_sched_gc
 * do the allocations from task context. */
0519: static enum hrtimer_restart fire_gc_timer(struct hrtimer *timer)
0520: {
0521:         struct dm_gecko *dmg = container_of(timer, struct dm_gecko, timer);
0522: 
0523:         if (!atomic_read(&dmg->timer_active)) {
0524:                 return HRTIMER_NORESTART;
0525:         }
0526:         if (atomic_cmpxchg(&dmg->gc_work_scheduled_by_timer, 0, 1) == 0) {
0527:                 queue_work(gecko_wqueue, &dmg->gc_work);
0528:         }
0529:         hrtimer_forward_now(timer, dmg->timer_delay);
0530:         return HRTIMER_RESTART;
0531: }
0532: 
0533: // TODO(tudorm): make this work for raid1 with # of stripes > 2
0534: // and DM_GECKO_INDEPENDENT_GC / DM_GECKO_READ_TPUT;
0535: static int default_read_stripe_for_layout(enum dm_gecko_layout layout) {
0536:   int stripe = 0;
0537:   switch(layout) {
0538:   case linear:
0539:   case raid0:
0540:   case raid1:
0541:           stripe = 0;
0542:           break;
0543:   case raid5:
0544:   case raid6:
0545:   default:
0546:           printk(DM_GECKO_PREFIX "unimplemented layout\n");
0547:           BUG_ON(1);
0548:           break;
0549:   }
0550:   return stripe;
0551: }
0552: 
0553: static int default_gc_stripe_for_layout(enum dm_gecko_layout layout) {
0554:   int stripe = 0;
0555:   switch(layout) {
0556:   case linear:
0557:   case raid0:
0558:           stripe = 0;
0559:           break;
0560:   case raid1:
0561:           stripe = 1;
0562:           break;
0563:   case raid5:
0564:   case raid6:
0565:   default:
0566:           printk(DM_GECKO_PREFIX "unimplemented layout\n");
0567:           BUG_ON(1);
0568:           break;
0569:   }
0570:   return stripe;
0571: }
0572: 
0573: static inline int choose_load_balanced_stripe(struct dm_gecko *dmg)
0574: {
0575:         /* load balance using the per-CPU counter for READs =>
0576:          * sloppy counter */
0577:         unsigned long long sloppy_read_cnt;
0578:         get_cpu();
        /* We can't just use the read count; the gc events must be counted
         * as well.  Otherwise, after a period of inactivity during which
         * only the gc runs, sloppy_read_cnt would stay constant and all gc
         * read requests would hit the same disk. */
0583:         sloppy_read_cnt = (this_cpu_ptr(dmg->stats))->reads +
0584:                 this_cpu_ptr(dmg->stats)->gc;
0585:         put_cpu();
0586:         return do_div(sloppy_read_cnt, dmg->disk_map.stripes);
0587: }
0588: 
0589: static int choose_read_stripe(sector_t sector, struct dm_gecko *dmg)
0590: {
0591:         int stripe = default_read_stripe_for_layout(dmg->disk_map.layout);
0592:         if (sector_in_seg(sector, dmg->head_seg)) {
0593:                 return choose_load_balanced_stripe(dmg);
0594:         } else if (sector_in_seg(sector, dmg->tail_seg) &&
0595:                    test_bit(DM_GECKO_INDEPENDENT_GC, &dmg->flags)) {
0596:                 return stripe;
0597:         }
0598:         // fall through
0599:         if (test_bit(DM_GECKO_READ_TPUT, &dmg->flags)) {
0600:                 return choose_load_balanced_stripe(dmg);
0601:         } else {
0602:                 return stripe;
0603:         }
0604: }
0605: 
0606: static int choose_gc_stripe(sector_t sector, struct dm_gecko *dmg)
0607: {
0608:         if (sector_in_seg(sector, dmg->head_seg)) {
0609:                 return choose_load_balanced_stripe(dmg);
0610:         } else if (test_bit(DM_GECKO_INDEPENDENT_GC, &dmg->flags)) {
0611:                 // TODO(tudorm): BUG_ON if not on the tail segment.
0612:                 return default_gc_stripe_for_layout(dmg->disk_map.layout);
0613:         } else {
0614:                 if (test_bit(DM_GECKO_READ_TPUT, &dmg->flags)) {
0615:                         return choose_load_balanced_stripe(dmg);
0616:                 } else {
0617:                         // yes, READ stripe!
0618:                         return default_read_stripe_for_layout(
0619:                                 dmg->disk_map.layout);
0620:                 }
0621:         }
0622: }
0623: 
0624: // straced hdparm and used the following symbols from its source
0625: #define SG_ATA_16             0x85
0626: #define SG_ATA_16_LEN         16
0627: #define ATA_USING_LBA         (1 << 6)
0628: #define ATA_OP_SLEEPNOW1      0xe6
0629: #define ATA_OP_SLEEPNOW2      0x99
0630: #define ATA_OP_STANDBYNOW1    0xe0
0631: #define ATA_OP_STANDBYNOW2    0x94
0632: #define ATA_OP_SETIDLE        0xe3
0633: #define SG_ATA_PROTO_NON_DATA (3 << 1)
0634: #define SG_CDB2_CHECK_COND    (1 << 5)
0635: 
0636: static void prep_SG_ATA_cmd_block(unsigned char *cmd_block,
0637:                                   enum seg_power_state pow_state)
0638: {
0639:         BUG_ON(pow_state == unspecified);
0640:         cmd_block[0] = SG_ATA_16;
0641:         cmd_block[1] = SG_ATA_PROTO_NON_DATA;
0642:         cmd_block[2] = SG_CDB2_CHECK_COND;
0643:         cmd_block[13] = ATA_USING_LBA;
0644:         switch (pow_state) {
0645:         case active:
0646:                 cmd_block[6] = 0;        // set the delay to 0
0647:                 cmd_block[14] = ATA_OP_SETIDLE;
0648:                 break;
0649:         case standby:
0650:                 cmd_block[14] = ATA_OP_STANDBYNOW1;
0651:                 break;
0652:         case sleep:
0653:                 cmd_block[14] = ATA_OP_SLEEPNOW1;
0654:                 break;
0655:         default:
0656:                 BUG_ON(1);
0657:         }
0658: }
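/* For reference, these non-data ATA pass-through commands roughly match
 * what `hdparm -S 0' (keep active, standby timer disabled), `hdparm -y'
 * (standby) and `hdparm -Y' (sleep) issue, which is where the opcodes
 * above were lifted from. */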
0659: 
0660: /* Put device into active, standby, or sleep mode. If the device is
0661:  * put into lowest power sleep mode, it will be shut down
0662:  * completely. A reset is required before the drive can be accessed
0663:  * again, and the Linux IDE driver should automatically issue the
0664:  * reset on demand (tested on a 2.6.35 kernel and it does indeed
0665:  * automatically issue the reset). */
0666: static void set_drive_power(struct block_device *bdev,
0667:                             enum seg_power_state pow_state)
0668: {
0669:         mm_segment_t old_fs = get_fs();
0670:         struct gendisk *disk = bdev->bd_disk;
0671:         struct sg_io_hdr hdr;
0672:         unsigned char sense_b[32];
0673:         unsigned char cmd_block[SG_ATA_16_LEN];
0674:         int err;
0675: 
0676:         memset(&hdr, 0, sizeof(hdr));
0677:         memset(&sense_b, 0, sizeof(sense_b));
0678:         memset(cmd_block, 0, sizeof(cmd_block));
0679:         prep_SG_ATA_cmd_block((unsigned char *)&cmd_block, pow_state);
0680: 
0681:         hdr.interface_id = SG_INTERFACE_ID_ORIG;
0682:         hdr.dxfer_direction = SG_DXFER_NONE;
0683:         hdr.cmd_len = sizeof(cmd_block);
0684:         hdr.mx_sb_len = sizeof(sense_b);
0685:         hdr.sbp = sense_b;
0686:         hdr.cmdp = cmd_block;
0687:         hdr.timeout = 10000;        // timeout in milliseconds
0688: 
0689:         set_fs(KERNEL_DS);
0690:         err = blkdev_ioctl(bdev, 0, SG_IO, (unsigned long)&hdr);
0691:         if (err) {
0692:                 printk(DM_GECKO_PREFIX "sg_io error %d on %s\n", err,
0693:                        disk->disk_name);
0694:         } else {
0695:                 printk(DM_GECKO_PREFIX
0696:                        "set /dev/%s drive power state to %s\n",
0697:                        disk->disk_name,
0698:                        pow_state ==
0699:                        active ? "active" : ((pow_state == standby) ?
0700:                                             "standby" : "sleep"));
0701:         }
0702:         set_fs(old_fs);
0703: }
0704: 
0705: static void run_dm_dev_seg(struct work_struct *work)
0706: {
0707:         struct dm_dev_seg *seg = container_of(work, struct dm_dev_seg, work);
0708:         struct dm_gecko *dmg = seg->ctxt;
0709: 
0710:         if (dmg->disk_map.layout != raid1 ||
0711:             seg->next_pow_state == seg->cur_pow_state ||
0712:             test_bit(DM_GECKO_READ_TPUT, &dmg->flags) ||
0713:             seg == dmg->head_seg ||
0714:             (seg == dmg->tail_seg &&
0715:             test_bit(DM_GECKO_INDEPENDENT_GC, &dmg->flags))) {
0716:                 goto out_reset_next_pow_state;
0717:         }
0718: 
0719:         if (seg->next_pow_state == standby || seg->next_pow_state == sleep) {
0720:                 int i, err;
0721:                 for (i = 0; i < dmg->disk_map.stripes; i++) {
                /* Blocking flush while in the workqueue's task context,
                 * hence it delays deferred IO and gc events scheduled on
                 * the same workqueue.  The flush is unnecessary when this
                 * is called as a result of the tail advancing. */
0726:                         err = blkdev_issue_flush(seg->dev[i]->bdev, GFP_KERNEL,
0727: #if LINUX_VERSION_CODE <= KERNEL_VERSION(2, 6, 36)
0728:                                                  NULL, BLKDEV_IFL_WAIT);
0729: #else
0730:                                                  NULL);
0731: #endif
0732:                         if (err) {
0733:                                 printk(DM_GECKO_PREFIX "disk flush failed "
0734:                                        "with status %d\n", err);
0735:                         }
0736:                         if (i != default_read_stripe_for_layout(
0737:                                 dmg->disk_map.layout)) {
0738:                                 set_drive_power(seg->dev[i]->bdev,
0739:                                                 seg->next_pow_state);
0740:                         }
0741:                 }
0742:                 seg->cur_pow_state = seg->next_pow_state;
0743:         }
0744: out_reset_next_pow_state:
0745:         seg->next_pow_state = unspecified;
0746: }
0747: 
0748: static int store_dm_gecko(struct dm_gecko *dmg);
0749: 
0750: static void do_sync_metadata(struct dm_gecko *dmg)
0751: {
0752:         unsigned long saved_flags;
0753:         int err = 0;
0754: 
0755:         BUG_ON(in_interrupt());
0756: 
0757:         // Optimization that does not allow two (non-final)
0758:         // metadata-sync operations to proceed at roughly the same
0759:         // time.
0760:         if (test_and_set_bit(DM_GECKO_SYNCING_METADATA, &dmg->flags)) {
0761:                 printk(DM_GECKO_PREFIX "A metadata-sync operation is already "
0762:                        "in progress.\n");
0763:                 return;
0764:         }
0765: 
0766:         down_write(&dmg->metadata_sync_sema);
0767:         saved_flags = dmg->flags;
0768:         // No more new IOs are being submitted from this point on.
0769:         if (test_bit(DM_GECKO_FINAL_SYNC_METADATA, &dmg->flags)) {
0770:                 printk(DM_GECKO_PREFIX "Should not be able to issue a "
0771:                        "metadata-sync operation after target destroy.\n");
0772:                 BUG_ON(true);
0773:         }
0774:         // Turn off the gc.
0775:         set_bit(DM_GECKO_GC_FORCE_STOP, &dmg->flags);
0776:         // Turn off the (gc) timer.
0777:         atomic_set(&dmg->timer_active, 0);
0778:         hrtimer_cancel(&dmg->timer);
0779:         // Wait for all pending io jobs (including gc jobs) to finish.
0780:         wait_event(dmg->jobs_pending_waitqueue, !atomic_read(&dmg->total_jobs));
0781: 
0782:         err = store_dm_gecko(dmg);
0783:         if (err) {
0784:                 printk(DM_GECKO_PREFIX "Unable to store gecko metadata\n");
0785:                 goto out;
0786:         }
0787:         dmg->persistent_tail = dmg->tail;
0788:         dmg->persistent_available_blocks = dmg->available_blocks;
0789: 
0790: out:
0791:         dmg->flags = saved_flags;  // Restore other flags.
0792:         clear_bit(DM_GECKO_SYNCING_METADATA, &dmg->flags);
0793:         up_write(&dmg->metadata_sync_sema);
0794: }
0795: 
0796: static void sync_metadata(struct work_struct *work)
0797: {
0798:         struct dm_gecko *dmg = container_of(work,
0799:                                             struct dm_gecko,
0800:                                             sync_metadata_work);
0801:         do_sync_metadata(dmg);
0802: }
0803: 
0804: static void run_deferred_jobs(struct work_struct *unused_work_struct)
0805: {
0806:         unsigned long flags;
0807:         struct deferred_stats *def_stats;
0808: 
0809:         BUG_ON(in_interrupt());
0810: 
0811:         spin_lock_irqsave(&jobs_lock, flags);
0812:         /* preemption disabled under spinlock */
0813:         def_stats = &__get_cpu_var(deferred_stats);
0814:         while (!list_empty(&deferred_jobs)) {
0815:                 struct io_job *io =
0816:                     container_of(deferred_jobs.next, struct io_job, list);
0817:                 list_del(&io->list);
0818:                 --def_stats->total;
0819: 
0820:                 if (io_job_is_gc(io)) {
0821:                         ++def_stats->gc;
0822:                         spin_unlock_irqrestore(&jobs_lock, flags);
0823:                         BUG_ON(!io_job_is_deferred(io));
0824:                         do_run_gc(io);
0825:                 } else {
0826:                         ++def_stats->rw;
0827:                         spin_unlock_irqrestore(&jobs_lock, flags);
0828:                         BUG_ON(!io_job_is_deferred(io));
0829:                         map_rw_io_job(io);
0830:                 }
0831:                 spin_lock_irqsave(&jobs_lock, flags);
0832:                 // May have migrated CPUs so grab a fresh reference.
0833:                 def_stats = &__get_cpu_var(deferred_stats);
0834:         }
0835:         spin_unlock_irqrestore(&jobs_lock, flags);
0836: }
0837: 
0838: /* operation on hash table */
0839: static struct io_for_block *get_io_for_block(struct dm_gecko *dmg,
0840:                                              sector_t key)
0841: {
0842:         struct io_for_block *io4b;
0843: 
0844:         unsigned long bucket_idx = hash_long(key, HASH_TABLE_BITS);
0845:         struct list_head *bucket = &dmg->buckets[bucket_idx];
0846: 
0847:         list_for_each_entry(io4b, bucket, hashtable) {
0848:                 if (io4b->key == key) {
0849:                           return io4b;
0850:                 }
0851:         }
0852:         return NULL;
0853: }
0854: 
0855: /* WARNING: duplicates are not checked for, you have been advised,
0856:  * play nice */
0857: static void put_io_for_block(struct dm_gecko *dmg, u32 key,
0858:                              struct io_for_block *io4b)
0859: {
0860:         unsigned long bucket_idx = hash_long(key, HASH_TABLE_BITS);
0861:         struct list_head *bucket = &dmg->buckets[bucket_idx];
0862: 
0863:         io4b->key = key;
0864:         list_add_tail(&io4b->hashtable, bucket);
0865:         ++dmg->htable_size;
0866: }
0867: 
0868: static void wake_up_free_space_available(struct dm_gecko *dmg)
0869: {
0870:         unsigned long flags;
0871:         spin_lock_irqsave(&dmg->lock, flags);
0872:         __wake_up_locked(&dmg->no_free_space_waitqueue, TASK_NORMAL);
0873:         spin_unlock_irqrestore(&dmg->lock, flags);
0874: }
0875: 
0876: static inline u32 __relocatable_blocks(struct dm_gecko *dmg)
0877: {
0878:         return dmg->free_blocks - dmg->available_blocks;
0879: }
0880: 
0881: static inline u32 __unavailable_blocks(struct dm_gecko *dmg)
0882: {
0883:         return dmg->size - dmg->available_blocks;
0884: }
0885: 
0886: static inline u32 __used_blocks(struct dm_gecko *dmg)
0887: {
0888:         return dmg->size - dmg->free_blocks;
0889: }
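/*
 * Block accounting, as used by the helpers above:
 *   available_blocks: slots outside the occupied log region that can be
 *                     claimed immediately;
 *   free_blocks:      available slots plus the stale `holes' inside the
 *                     occupied region, which only the gc can reclaim;
 *   __relocatable_blocks() == free - available  (holes the gc can compact)
 *   __used_blocks()        == size - free       (blocks holding live data)
 *   __unavailable_blocks() == size - available  (span of the occupied region)
 */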
0890: 
0891: /* Should probably encode this in a proper DFA */
0892: /* ->lock must be held */
0893: static int __gc_needs_to_run(struct dm_gecko *dmg)
0894: {
        /* TODO: check how many available and free blocks there are and
         * their ratio; use a set of watermarks, e.g. start the gc once
         * contiguous available space drops to <= 10%.  When the reserved
         * block percentage is hit (say 5%), block the writers.  Make sure
         * the gc can continue to make progress. */
0900: 
0901:         //u32 used_blocks = __used_blocks(dmg);
0902:         //u32 unavailable_blocks = __unavailable_blocks(dmg);
0903:         u32 max_relocatable_blocks = __relocatable_blocks(dmg);
0904: 
0905: #ifdef ALWAYS_RUN_GC
0906:         return (max_relocatable_blocks != 0);
0907: #endif
0908: 
0909:         if (dmg->gc_req_in_progress >= dmg->max_gc_req_in_progress) {
0910:                 return 0;
0911:         }
0912: 
0913:         if (test_bit(DM_GECKO_GC_FORCE_STOP, &dmg->flags)) {
0914:                 return 0;
0915:         }
0916:         /* Return `no need to run gc' early if:
0917:          *      - there are no gaps / holes
0918:          *      - there are no more concurrent gc requests allowed */
0919:         if (max_relocatable_blocks == 0) {
0920:                 clear_bit(DM_GECKO_GC_STARTED, &dmg->flags);
0921:                 return 0;
0922:         }
0923: 
0924:         if (test_bit(DM_GECKO_GC_STARTED, &dmg->flags)) {
0925:                 if (max_relocatable_blocks <= dmg->gc_ctrl.low_watermark) {
0926:                         clear_bit(DM_GECKO_GC_STARTED, &dmg->flags);
0927:                         return 0;
0928:                 } else {
0929:                         return 1;
0930:                 }
0931:         } else {
0932:                 if (max_relocatable_blocks >= dmg->gc_ctrl.high_watermark) {
0933:                         set_bit(DM_GECKO_GC_STARTED, &dmg->flags);
0934:                         return 1;
0935:                 } else {
0936:                         return 0;
0937:                 }
0938:         }
0939: }
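/*
 * Example: assuming gc_ctrl is initialized from the GC_DEFAULT_* values
 * above (high watermark 3, low watermark 0), the gc starts once at least
 * 3 relocatable `holes' have accumulated and keeps running until the log
 * is fully compacted, at which point DM_GECKO_GC_STARTED is cleared.
 */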
0940: 
0941: // the function may be called from irq context, so can't block
0942: static void sched_delayed_power_adjustment_for_segment(struct dm_dev_seg
0943:                                                        *seg,
0944:                                                        enum seg_power_state
0945:                                                        next_pow_state)
0946: {
0947:         struct dm_gecko *dmg = seg->ctxt;
0948:         if (seg == dmg->head_seg || dmg->disk_map.layout != raid1 ||
0949:             (seg == dmg->tail_seg &&
0950:             test_bit(DM_GECKO_INDEPENDENT_GC, &dmg->flags))) {
0951:                 return;
0952:         }
0953:         seg->next_pow_state = next_pow_state;
0954:         queue_work(gecko_wqueue, &seg->work);
0955: }
0956: 
/* Allocate/claim the next contiguously available block for writing or
   gc.  There is no need to check whether the circular ring is full,
   since ->available_blocks is kept consistent and indicates how many
   slots remain available. */
static u32 __claim_next_free_block(struct dm_gecko *dmg)
0962: {
0963:         u32 head = dmg->head;
0964:         BUG_ON(!is_block_marked_free(dmg->r_map[head], dmg));
0965:         if ((++dmg->head) == dmg->size) {
0966:                 dmg->head = 0;
0967:                 ++dmg->head_wrap_around;
0968:         }
0969:         --dmg->persistent_available_blocks;
0970:         --dmg->available_blocks;
0971:         --dmg->free_blocks;
0972:         return head;
0973: }
0974: 
/* ->lock must be held.  Fast-forward the tail past blocks that have
   already been freed, making those slots available; returns the number
   of slots reclaimed. */
0976: static u32 __ffwd_tail(struct dm_gecko *dmg)
0977: {
0978:         u32 cnt;
0979:         for (cnt = 0; dmg->tail != dmg->head; ++cnt) {
0980:                 if (!is_block_marked_free(dmg->r_map[dmg->tail], dmg)) {
0981:                         break;
0982:                 }
0983:                 /* can fast forward the tail one slot worth */
0984:                 if ((++dmg->tail) == dmg->size) {
0985:                         dmg->tail = 0;
0986:                         ++dmg->tail_wrap_around;
0987:                 }
0988:                 ++dmg->available_blocks;
0989:         }
0990:         return cnt;
0991: }
0992: 
0993: /* ->lock must be held */
0994: static void __relocate_gc_written_block(struct io_job *io)
0995: {
0996:         struct dm_gecko *dmg = io->dmg;
0997:         u32 old_l_block = dmg->d_map[io->v_block];
0998: 
0999:         BUG_ON(dmg->r_map[old_l_block] != io->v_block);
1000:         BUG_ON(dmg->r_map[io->l_block] != io->v_block);
1001: 
1002:         dmg->r_map[old_l_block] = mark_block_free(dmg);
1003:         ++dmg->free_blocks;
1004:         dmg->d_map[io->v_block] = io->l_block;
1005: }
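/*
 * Worked example with hypothetical block numbers: suppose virtual block v
 * lived at linear block 5 (d_map[v] == 5, r_map[5] == v) and the gc copied
 * it into the freshly claimed linear block 9 (r_map[9] == v, io->l_block
 * == 9).  After __relocate_gc_written_block(), r_map[5] holds the free
 * sentinel, d_map[v] == 9, and free_blocks has grown by one; the slot at 5
 * becomes reusable once the tail fast-forwards past it.
 */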
1006: 
1007: /* ->lock must be held */
1008: static void __relocate_written_block(struct io_job *io)
1009: {
1010:         struct dm_gecko *dmg = io->dmg;
1011: 
1012:         BUG_ON(dmg->r_map[io->l_block] != io->v_block);
1013:         // Since the lock is held while calling this function, the
1014:         // block indexed by io->v_block may be in one of two states
1015:         // only. It may either be marked free, or it may point to a
1016:         // linear block in the reverse map that correctly holds the
1017:         // back link, irrespective of the fact that reads and writes
1018:         // are issued concurrently.
1019:         if (!is_block_marked_free(dmg->d_map[io->v_block], dmg)) {
1020:                 u32 old_l_block = dmg->d_map[io->v_block];
1021:                 BUG_ON(dmg->r_map[old_l_block] != io->v_block);
1022:                 dmg->r_map[old_l_block] = mark_block_free(dmg);
1023:                 ++dmg->free_blocks;
1024:         }
1025:         dmg->d_map[io->v_block] = io->l_block;
1026: }
1027: 
/* ->lock must be held.  Called when a write to linear block @block completes.
 * If that block lies in a segment other than the current head segment, this
 * was the segment's last outstanding write, and the access sequence shows the
 * log head has advanced into it, promote it to be the new head segment and
 * return the old head segment (so the caller may power it down); otherwise
 * return NULL. */
static struct dm_dev_seg *__touch_new_head_seg(struct dm_gecko *dmg, u32 block)
{
        struct dm_dev_seg *ret = NULL;
        struct dm_dev_seg *seg = seg_for_sector(dmg, block_to_sector(block));
1033:         int is_last_write = atomic_dec_and_test(&seg->pending_writes);
1034: 
1035:         if (seg != dmg->head_seg && is_last_write) {
1036:                 struct dm_dev_seg *head_seg = dmg->head_seg;
1037:                 BUG_ON(seg->access_seq_in_log == head_seg->access_seq_in_log);
1038:                 if (seg->access_seq_in_log < head_seg->access_seq_in_log) {
1039:                         seg->access_seq_in_log =
1040:                             head_seg->access_seq_in_log + 1;
1041:                         dmg->head_seg = seg;
1042:                         ret = head_seg;
1043:                 }
1044:         }
1045:         return ret;
1046: }
1047: 
1048: static void gc_complete_callback(int read_err, unsigned long write_err,
1049:                                  void *ctxt)
1050: {
1051:         struct io_job *io = (struct io_job *)ctxt;
1052:         struct io_for_block *io4b = io->parent;
1053:         /* hold the reference, io_job may be released early */
1054:         struct dm_gecko *dmg = io->dmg;
1055:         unsigned long flags;
1056:         struct dm_dev_seg *seg = NULL, *old_tail_seg = NULL;
1057:         int keep_running = 0, freed_blocks = 0;
1058: 
1059:         /* TODO: if kcopyd fails, handle the errors as in the IO
1060:          * completion */
1061:         BUG_ON(read_err || write_err);
1062: 
1063:         spin_lock_irqsave(&dmg->lock, flags);
1064: 
1065:         __relocate_gc_written_block(io);
1066:         seg = __touch_new_head_seg(dmg, io->l_block);
1067:         freed_blocks = __ffwd_tail(dmg);  // always after __relocate
1068:         if (freed_blocks) {
1069:                 struct dm_dev_seg *tail_seg = seg_for_sector(dmg,
1070:                         block_to_sector(dmg->tail));
1071:                 if (tail_seg != dmg->tail_seg) {
1072:                         old_tail_seg = dmg->tail_seg;
1073:                         dmg->tail_seg = tail_seg;
1074:                 }
1075:         }
1076:         keep_running = __gc_needs_to_run(dmg);
1077: 
1078:         list_del(&io->list);
1079:         BUG_ON(io4b->rw_cnt >= 0 || (!list_empty(&io4b->pending_io)));
1080: 
1081:         list_del(&io4b->hashtable);
1082:         --dmg->htable_size;
1083:         --dmg->gc_req_in_progress;
1084: 
1085:         spin_unlock_irqrestore(&dmg->lock, flags);
1086: 
1087:         if (!list_empty(&io4b->deferred_io)) {
1088:                 struct io_job *rw_io_job, *tmp;
1089:                 struct deferred_stats *def_stats;
1090: 
1091:                 spin_lock_irqsave(&jobs_lock, flags);
1092:                 def_stats = &__get_cpu_var(deferred_stats);
1093:                 list_for_each_entry_safe(rw_io_job, tmp,
1094:                                          &io4b->deferred_io, list) {
1095:                         list_del(&rw_io_job->list);
1096:                         __add_deferred_io_job(rw_io_job);
1097:                         ++def_stats->total;
1098:                 }
1099:                 spin_unlock_irqrestore(&jobs_lock, flags);
1100:                 wake_deferred_wqueue();
1101:         }
1102: 
1103:         mempool_free(io4b, io_for_block_mempool);
1104:         if (keep_running) {
1105:                 struct dm_gecko_stats *stats;
1106:                 get_cpu();
1107:                 stats = this_cpu_ptr(dmg->stats);
1108:                 ++stats->gc_recycle;
1109:                 put_cpu();
1110:                 /* recycle the io_job, be very careful since may be
1111:                  * in_interrupt() */
1112:                 wake_gc(io);
1113:         } else {
1114:                 mempool_free(io, io_job_mempool);
1115:                 wake_up_free_space_available(dmg);
1116:                 do_complete_generic(dmg);
1117:         }
1118:         if (seg != NULL) {
1119:                 /* seg holds the reference to the old segment */
1120:                 printk(DM_GECKO_PREFIX
1121:                        "transitioning from old seg: %d:%llu to: %d:%llu "
1122:                        "(gc_complete)\n", seg->idx, seg->access_seq_in_log,
1123:                        dmg->head_seg->idx, dmg->head_seg->access_seq_in_log);
1124:                 sched_delayed_power_adjustment_for_segment(seg,
1125:                                                            dmg->low_pow_state);
1126:         }
1127:         if (freed_blocks > 0) {
1128:           wake_up_free_space_available(dmg);
1129:         }
1130:         if (old_tail_seg != NULL) {
1131:                 printk(DM_GECKO_PREFIX "tail segment transition from : %d "
1132:                        "to %d\n", old_tail_seg->idx, dmg->tail_seg->idx);
1133:                 sched_delayed_power_adjustment_for_segment(old_tail_seg,
1134:                                                            dmg->low_pow_state);
1135:         }
1136: }
1137: 
1138: /* Cannot be called from interrupt context */
1139: void gc_complete_read_noirq(int *dst_count,
1140:                             struct dm_io_region *dst,
1141:                             void *context) {
1142:         struct io_job *io = (struct io_job *) context;
1143:         struct dm_gecko *dmg = io->dmg;
1144:         unsigned long flags;
1145:         struct dm_dev_seg* seg;
1146: 
1147:         BUG_ON(in_interrupt());
1148:         BUG_ON(*dst_count > 0);
1149:         BUG_ON(!is_block_marked_free(io->l_block, dmg));
1150: 
1151:         spin_lock_irqsave(&dmg->lock, flags);
1152: 
1153:         if (__no_available_blocks_hard(dmg)) {
1154:                 printk(DM_GECKO_PREFIX "ran out of space.\n");
1155:                 BUG_ON(1);
1156:         }
1157:         io->l_block = __claim_next_free_block(dmg);
1158:         dmg->r_map[io->l_block] = io->v_block;
1159: 
1160:         seg = linear_to_phy_all(dmg, block_to_sector(io->l_block), dst,
1161:                                 dst_count);
1162: 
1163:         atomic_inc(&seg->pending_writes);
1164:         /* TODO: if l_block on next seg, anticipate and power-up */
1165: 
1166:         spin_unlock_irqrestore(&dmg->lock, flags);
1167: }
1168: 
1169: static void dm_dispatch_io_gc(struct io_job *io)
1170: {
1171:         struct dm_gecko *dmg = io->dmg;
1172:         struct dm_io_region src;
1173:         sector_t src_l_block, sector;
1174: 
        /* There is no need to synchronize access to the phy maps here.
           Further, the block map itself can safely be read to fetch the
           entry at index io->v_block, since the hashtable was synchronously
           updated to indicate that the gc is scheduled to run on that block,
           and there are no concurrent operations on the same block while it
           is relocated by the gc.  This means that any further operations
           will not touch said block and therefore will not alter the map
           entry at index io->v_block.  Be aware that this is NOT the case
           for dm_dispatch_io_bio, since regular reads and writes may be
           issued concurrently. */
1185:         src_l_block = dmg->d_map[io->v_block];
1186:         BUG_ON(dmg->r_map[src_l_block] != io->v_block);
1187:         BUG_ON(!is_block_marked_free(io->l_block, dmg));
1188: 
1189:         sector = block_to_sector(src_l_block);
1190:         linear_to_phy_which(dmg, sector, choose_gc_stripe(sector, dmg), &src);
1191: 
1192:         DPRINTK("Relocating [block:cnt(device-major:device-minor)] "
1193:                 "%llu:%llu(%u:%u)\n",
1194:                 (unsigned long long)sector_to_block(src.sector),
1195:                 (unsigned long long)sector_to_block(src.count),
1196:                 MAJOR(src.bdev->bd_dev), MINOR(src.bdev->bd_dev));
1197: 
1198:         dmg_kcopyd_copy(
1199:             io->dmg->kcopyd_client, &src, 0, NULL, 0,
1200:             (dmg_kcopyd_notify_fn) gc_complete_callback, (void *) io,
1201:             NULL,
1202:             (dmg_kcopyd_notify_readdone_fn_noirq) gc_complete_read_noirq);
1203: }
1204: 
1205: static void do_run_gc(struct io_job *io)
1206: {
1207:         struct dm_gecko *dmg = io->dmg;
1208:         struct dm_gecko_stats *stats;
1209:         unsigned long flags;
1210:         u32 relocated_block;
1211:         int dispatch_io_now = 1, freed_blocks = 0;
1212:         struct io_for_block *io4b, *extant_io4b;
1213: 
1214:         BUG_ON(in_interrupt());
1215:         BUG_ON(!io_job_is_gc(io));
1216: 
1217:         io4b = mempool_alloc(io_for_block_mempool, GFP_NOIO);
1218:         io4b->rw_cnt = -1;
1219:         INIT_LIST_HEAD(&io4b->pending_io);
1220:         INIT_LIST_HEAD(&io4b->deferred_io);
1221:         io->parent = io4b;
1222:         io->l_block = 0;  /* must be marked free before dispatching gc */
1223:         list_add_tail(&io->list, &io4b->pending_io);
1224: 
1225:         spin_lock_irqsave(&dmg->lock, flags);
1226:         /* preemption is disabled under spinlock */
1227:         stats = this_cpu_ptr(dmg->stats);
1228:         freed_blocks = __ffwd_tail(dmg);
1229: 
1230:         if (is_block_marked_free(dmg->r_map[dmg->tail], dmg)) {
1231:                 BUG_ON(dmg->head != dmg->tail);
1232:                 goto out_no_need_to_run;
1233:         }
1234: 
1235:         if (!__gc_needs_to_run(dmg)) {
1236: out_no_need_to_run:
1237:                 spin_unlock_irqrestore(&dmg->lock, flags);
1238:                 mempool_free(io, io_job_mempool);
1239:                 mempool_free(io4b, io_for_block_mempool);
1240:                 do_complete_generic(dmg);
1241:                 goto out_free_blocks;
1242:         }
1243: 
1244:         relocated_block = dmg->tail;
1245:         io->v_block = dmg->r_map[relocated_block];
1246:         BUG_ON(is_block_free_or_invalid(io->v_block, dmg));
1247:         BUG_ON(dmg->d_map[io->v_block] != relocated_block);
1248: 
1249: gc_next_relocated_block:
1250:         extant_io4b = get_io_for_block(dmg, io->v_block);
1251:         if (!extant_io4b) {
1252:                 put_io_for_block(dmg, io->v_block, io4b);
1253:                 io4b = NULL;        /* prevent deallocation */
1254:                 io->l_block = mark_block_free(dmg);
1255:                 ++stats->gc;
1256:                 ++dmg->gc_req_in_progress;
1257:         } else {
1258:                 /* there is IO or gc activity in progress on this
1259:                  * block */
1260:                 BUG_ON(list_empty(&extant_io4b->pending_io));
1261:                 /* there are pending IOs and the gc was not yet
1262:                  * deferred on this block */
1263:                 if (extant_io4b->rw_cnt > 0
1264:                     && list_empty(&extant_io4b->deferred_io)) {
1265:                         ++stats->gc_rw_clash;
1266:                         list_add_tail(&io->list, &extant_io4b->deferred_io);
1267:                         io->parent = extant_io4b;
1268:                 } else {
1269:                         /* the gc is already running or was already
1270:                          * deferred on this block */
1271:                         BUG_ON(extant_io4b->rw_cnt == 0);
1272:                         ++stats->gc_clash;
1273:                         /* must fast forward until the next non-free
1274:                          * block is found */
1275:                         while (relocated_block != dmg->head) {
1276:                                 if ((++relocated_block) == dmg->size) {
1277:                                         /* wrap around */
1278:                                         relocated_block = 0;
1279:                                 }
1280:                                 io->v_block = dmg->r_map[relocated_block];
1281:                                 if (!is_block_marked_free(io->v_block, dmg)) {
1282:                                         BUG_ON(io->parent != io4b);
1283:                                         goto gc_next_relocated_block;
1284:                                 }
1285:                         }
1286:                         io->parent = NULL;
1287:                 }
1288:                 /* must be set here, after the above goto */
1289:                 dispatch_io_now = 0;
1290:         }
1291:         spin_unlock_irqrestore(&dmg->lock, flags);
1292: 
1293:         if (io->parent == NULL) {
1294:                 mempool_free(io, io_job_mempool);
1295:                 do_complete_generic(dmg);
1296:         }
1297:         if (io4b != NULL) {
1298:                 mempool_free(io4b, io_for_block_mempool);
1299:         }
1300:         if (dispatch_io_now) {
1301:                 dm_dispatch_io_gc(io);
1302:         }
1303: out_free_blocks:
1304:         if (freed_blocks > 0) {
1305:                 wake_up_free_space_available(dmg);
1306:         }
1307: }
1308: 
1309: static void memcpy_bio_into_page(struct io_job *io)
1310: {
1311:         int i;
1312:         struct bio_vec *bvec;
1313:         struct bio *bio = io->bio;
1314:         char *addr =
1315:             io->page + to_bytes(bio->bi_sector & GECKO_SECTOR_TO_BLOCK_MASK);
1316:         bio_for_each_segment(bvec, bio, i) {
1317:                 unsigned long flags;
1318:                 /* I wonder if I can use page_address(->bv_page) +
1319:                  * ->bv_offset instead of kmaps. */
1320:                 char *bio_addr = bvec_kmap_irq(bvec, &flags);
1321:                 memcpy(addr, bio_addr, bvec->bv_len);
1322:                 bvec_kunmap_irq(bio_addr, &flags);
1323:                 addr += bvec->bv_len;
1324:         }
1325: }
1326: 
1327: static void memcpy_page_into_bio(struct io_job *io)
1328: {
1329:         int i;
1330:         struct bio_vec *bvec;
1331:         struct bio *bio = io->bio;
1332:         char *addr =
1333:             io->page + to_bytes(bio->bi_sector & GECKO_SECTOR_TO_BLOCK_MASK);
1334:         bio_for_each_segment(bvec, bio, i) {
1335:                 unsigned long flags;
1336:                 char *bio_addr = bvec_kmap_irq(bvec, &flags);
1337:                 memcpy(bio_addr, addr, bvec->bv_len);
1338:                 bvec_kunmap_irq(bio_addr, &flags);
1339:                 addr += bvec->bv_len;
1340:         }
1341: }
1342: 
1343: static void io_complete_callback(unsigned long err, void *context)
1344: {
1345:         struct io_job *io = (struct io_job *)context;
1346:         struct dm_gecko *dmg = io->dmg;
1347:         struct dm_gecko_stats *stats;
1348:         struct dm_dev_seg *seg = NULL;
1349:         struct io_for_block *io4b = io->parent;
1350:         int is_last_io = 0, freed_blocks = 0, run_gc = 0;
1351:         int read_modify_write = 0;
1352:         unsigned long flags;
1353: 
1354:         spin_lock_irqsave(&dmg->lock, flags);
1355:         /* preemption is disabled under spinlock */
1356:         stats = this_cpu_ptr(dmg->stats);
1357:         BUG_ON(io4b->rw_cnt <= 0);
1358:         if (err) {
1359:                 if (io->rw == READ) {
1360:                         zero_fill_bio(io->bio);
1361:                         DPRINTK("read error, returning 0s");
1362:                         ++stats->read_err;
1363:                 } else {
1364:                         ++stats->write_err;
1365:                         // TODO: perhaps keep the older block instead
1366:                         __relocate_written_block(io);
1367:                 }
1368:         } else {
1369:                 if (io->rw == WRITE) {
1370:                         __relocate_written_block(io);
1371:                         seg = __touch_new_head_seg(dmg, io->l_block);
1372:                         freed_blocks = __ffwd_tail(dmg);  // after _relocate
1373:                         run_gc = __gc_needs_to_run(dmg);
1374: #ifdef DROP_WRITE_WRITE_CLASH_OPTIMIZATION
1375:                         clear_bit(WRITE_CLASH_IO_FOR_BLOCK, &io4b->flags);
1376: #endif
1377:                 }
1378:         }
1379:         list_del(&io->list);        /* deleted from io4b->pending_io */
1380:         if ((--io4b->rw_cnt) == 0) {
1381:                 BUG_ON(!list_empty(&io4b->pending_io));
1382:                 list_del(&io4b->hashtable);
1383:                 --dmg->htable_size;
1384:                 is_last_io = 1;
1385:         }
1386:         spin_unlock_irqrestore(&dmg->lock, flags);
1387: 
1388:         if (io->page != NULL) {
1389:                 if (err) {
1390:                         free_page((unsigned long)io->page);
1391:                         io->page = NULL;
1392:                 } else {
1393:                         if (io->rw == READ && bio_data_dir(io->bio) == WRITE) {
1394:                                 memcpy_bio_into_page(io);
1395:                                 io->rw = WRITE;
1396:                                 /* resubmit IO (read-modify-write) */
1397:                                 queue_deferred_io_job(io);
1398:                                 read_modify_write = 1;
1399:                         } else {
1400:                                 if (bio_data_dir(io->bio) == READ) {
1401:                                         memcpy_page_into_bio(io);
1402:                                 }
1403:                                 free_page((unsigned long)io->page);
1404:                                 io->page = NULL;
1405:                         }
1406:                 }
1407:         }
1408:         if (is_last_io) {
1409:                 if (!list_empty(&io4b->deferred_io)) {
1410:                         struct io_job *io_gc =
1411:                             list_entry(io4b->deferred_io.next,
1412:                                        struct io_job, list);
1413:                         list_del(&io_gc->list);
1414:                         set_io_job_deferred(io_gc);
1415:                         /* `there can be only one' deferred gc job per
1416:                          * block */
1417:                         BUG_ON((!list_empty(&io4b->deferred_io))
1418:                                || (!io_job_is_gc(io_gc)));
1419:                         queue_deferred_io_job(io_gc);
1420:                 }
1421:                 mempool_free(io4b, io_for_block_mempool);
1422:         }
1423: 
1424:         if (seg != NULL) {
1425:                 /* seg holds the reference to the old segment. Call
1426:                  * before bio_endio notifies upper layer that the
1427:                  * operation has completed, to ensure that the flush
1428:                  * semaphore is taken so that subsequent explicit
1429:                  * flush requests (should they happen at such an
1430:                  * inopportune time) are serialized w.r.t. the flush
1431:                  * that is issued when writes advanced to a new
1432:                  * segment. */
1433:                 printk(DM_GECKO_PREFIX
1434:                        "transitioning from old seg: %d:%llu to: %d:%llu "
1435:                        "(io_complete)\n", seg->idx, seg->access_seq_in_log,
1436:                        dmg->head_seg->idx, dmg->head_seg->access_seq_in_log);
1437:                 sched_delayed_power_adjustment_for_segment(seg,
1438:                                                            dmg->low_pow_state);
1439:         }
1440:         if (!read_modify_write) {
1441:                 bio_endio(io->bio, err);
1442:                 if (freed_blocks > 0) {
1443:                         wake_up_free_space_available(dmg);
1444:                 }
1445:                 if (run_gc) {
1446:                         wake_gc(io); /* recycle the struct io_job */
1447:                 } else {
1448:                         mempool_free(io, io_job_mempool);
1449:                         do_complete_generic(dmg);
1450:                 }
1451:         }
1452: }
1453: 
1454: /* WARNING: do NOT touch any of the shared state (e.g. the direct and
1455:  * reverse relocation maps) from this function---accessing the members
1456:  * of the io_job passed in is safe, e.g. io->v_block or
1457:  * io->l_block. The context (parameter passed to the callback) is the
1458:  * io_job. */
1459: static int dm_dispatch_io_bio(struct io_job *io)
1460: {
1461:         struct dm_gecko *dmg = io->dmg;
1462:         struct dm_io_request iorq;
1463:         struct dm_io_region where[DM_GECKO_MAX_STRIPES];
1464:         struct bio *bio = io->bio;
1465:         sector_t sector;
1466:         int num_regions = 1;
1467:         int flags;
1468: 
1469:         /* The physical map requires no synchronization since it is
1470:          * initialized once and not altered henceforth. Further, the
1471:          * dm_io_region(s) can be allocated on-stack even though the
1472:          * dm_io is asynchronous since it is used to set the fields of
1473:          * a newly allocated bio (which is itself submitted for io
1474:          * through the submit_bio() interface). WARNING! do not touch
1475:          * the virtual and linear maps since reads and writes may be
1476:          * issued concurrently (that's the contract at the
         * block-level---request ordering is not ensured). */
1478: 
1479:         // the sector is the same for both READ and WRITE
1480:         sector = block_to_sector(io->l_block);
1481:         if (io->rw == READ) {
1482:                 num_regions = 1;
1483:                 linear_to_phy_which(dmg, sector,
1484:                                     choose_read_stripe(sector, dmg), where);
1485:                 DPRINTK("READ <dev %u:%u> sector: %llu count: %llu",
1486:                         MAJOR(where[0].bdev->bd_dev),
1487:                         MINOR(where[0].bdev->bd_dev),
1488:                         (unsigned long long)where[0].sector,
1489:                         (unsigned long long)where[0].count);
1490:         } else {
1491:                 linear_to_phy_all(dmg, sector, where, &num_regions);
1492:                 BUG_ON(num_regions > dmg->disk_map.stripes);
1493:                 DPRINTK
1494:                     ("WRITE <dev %u:%u> sector: %llu count: %llu num_dests: %u",
1495:                      MAJOR(where[0].bdev->bd_dev),
1496:                      MINOR(where[0].bdev->bd_dev),
1497:                      (unsigned long long)where[num_regions - 1].sector,
1498:                      (unsigned long long)where[num_regions - 1].count,
1499:                      num_regions);
1500:         }
1501: 
1502: #if LINUX_VERSION_CODE <= KERNEL_VERSION(2, 6, 24)
1503: #error "Kernel version unsuported (too old)."
1504: #elif LINUX_VERSION_CODE <= KERNEL_VERSION(2, 6, 28)
1505:         flags = 0; //(1 << BIO_RW_SYNC);
1506: #elif LINUX_VERSION_CODE <= KERNEL_VERSION(2, 6, 35)
1507:         flags = 0; //(1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG);
1508: #else
1509:         flags = 0; //(1 | REQ_SYNC | REQ_UNPLUG);
1510: #endif
1511:         iorq.bi_rw = (io->rw | flags);
1512: 
1513:         if (io->page != NULL) {
1514:                 /* unaligned request */
1515:                 iorq.mem.type = DM_IO_KMEM;
1516:                 iorq.mem.ptr.addr = io->page;
1517:                 iorq.mem.offset = 0;  // only required for DM_IO_PAGE_LIST
1518:         } else {
1519:                 iorq.mem.type = DM_IO_BVEC;
1520:                 iorq.mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx;
1521:         }
1522:         iorq.notify.fn = io_complete_callback;
1523:         iorq.notify.context = io;
1524:         iorq.client = dmg->io_client;
1525: 
1526:         /* The beauty of the log structure is that I need not maintain
1527:          * consistent ordering of write requests across mirrors, since
1528:          * all writes are performed against fresh blocks and no
1529:          * in-place modifications take place. For a conventional block
         * device, two writes may be issued concurrently (e.g. by
         * uncooperative processes bypassing the buffer cache w/
         * O_DIRECT) and while this may be perfectly fine for a block
         * device backed by a single disk, it may be an issue for a
1534:          * RAID-1 array. For example, dm_io -> async_io -> dispatch_io
1535:          * -> do_region will call submit_bio for every mirror disk,
1536:          * which means that concurrent requests A and B for the same
1537:          * block X mirrored on devices D1 and D2 may be queued to the
1538:          * respective elevators in any order (e.g. A, B for D1 and B,
1539:          * A for D2). This means that a write-write conflict will
         * break the RAID-1.  The log structure need not solve this
1541:          * issue at all, since by construction, the concurrent
1542:          * requests A and B for the same (virtual) block X will be
1543:          * mapped down to different linear blocks, and the latter
1544:          * request will persist correctly.
1545:          *
1546:          * Note that #ifdef DROP_WRITE_WRITE_CLASH_OPTIMIZATION then
1547:          * the above write-write conflict will not even occur since if
1548:          * concurrent writes are issued to the same block, only the
1549:          * first will succeed. We can get away with this optimization
1550:          * since POSIX does not guarantee any ordering of writes for
         * uncooperative processes issuing concurrent writes.
1552:          */
1553:         return dm_io(&iorq, num_regions, where, NULL);
1554: }
1555: 
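/* Map a block-aligned (or bounce-page backed) io_job onto a linear
 * block and dispatch it. Reads of never-written blocks are satisfied
 * with zeroes without touching the disks (a sub-block WRITE to such a
 * block skips the read phase and is re-queued directly as a WRITE).
 * Concurrent requests for the same virtual block share an io_for_block
 * tracked in the hash table; requests clashing with a gc relocation
 * are deferred, and with DROP_WRITE_WRITE_CLASH_OPTIMIZATION a WRITE
 * that clashes with another in-flight WRITE to the same block is
 * completed without being submitted. WRITE jobs claim the next free
 * block at the log head, blocking the caller on
 * no_free_space_waitqueue if the log is full. */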
1556: static void map_rw_io_job(struct io_job *io)
1557: {
1558:         struct dm_gecko *dmg = io->dmg;
1559:         struct dm_gecko_stats *stats;
1560:         unsigned long flags;
1561:         int dispatch_io_now = 1;
1562:         struct io_for_block *io4b, *extant_io4b;
1563: 
1564:         BUG_ON(in_interrupt());
1565:         io4b = mempool_alloc(io_for_block_mempool, GFP_NOIO);
1566:         io4b->rw_cnt = 1;
1567:         io4b->flags = 0;
1568:         INIT_LIST_HEAD(&io4b->pending_io);
1569:         INIT_LIST_HEAD(&io4b->deferred_io);
1570:         io->parent = io4b;
1571:         list_add_tail(&io->list, &io4b->pending_io);
1572: 
1573:         spin_lock_irqsave(&dmg->lock, flags);
1574:         /* preemption is disabled under spinlock */
1575:         stats = this_cpu_ptr(dmg->stats);
1576: 
1577:         BUG_ON(is_block_invalid(dmg->d_map[io->v_block], dmg));
1578:         if (!is_block_marked_free(dmg->d_map[io->v_block], dmg)) {
1579:                 io->l_block = dmg->d_map[io->v_block];
1580:                 BUG_ON(dmg->r_map[io->l_block] != io->v_block);
1581:         } else {
1582:                 io->l_block = mark_block_free(dmg);
1583:         }
1584:         if (io->rw == READ) {
1585:                 ++stats->reads;
1586:                 /* optimization: WARNING, it complicates the
1587:                  * read-modify-write code paths */
1588:                 if (is_block_marked_free(dmg->d_map[io->v_block], dmg)) {
1589:                         ++stats->read_empty;
1590:                         spin_unlock_irqrestore(&dmg->lock, flags);
1591:                         if (io->page != NULL) {
1592:                                 if (bio_data_dir(io->bio) == WRITE) {
1593:                                         clear_page(io->page);
1594:                                         memcpy_bio_into_page(io);
1595:                                         io->rw = WRITE;
1596:                                         /* resubmit IO (read-modify-write) */
1597:                                         queue_deferred_io_job(io);
1598:                                         return;
1599:                                 } else {
1600:                                         free_page((unsigned long)io->page);
1601:                                         /* and continue to fall through */
1602:                                 }
1603:                         }
1604:                         DPRINTK("READ unwritten blocks, returning zeroes.");
1605:                         zero_fill_bio(io->bio);
1606: #ifdef DROP_WRITE_WRITE_CLASH_OPTIMIZATION
1607: out_without_submitting_io:
1608: #endif
1609:                         bio_endio(io->bio, 0);
1610:                         mempool_free(io, io_job_mempool);
1611:                         mempool_free(io4b, io_for_block_mempool);
1612:                         do_complete_generic(dmg);
1613:                         return;
1614:                 }
1615:         } else {
1616:                 ++stats->writes;
1617:         }
1618:         extant_io4b = get_io_for_block(dmg, io->v_block);
1619:         if (!extant_io4b) {
1620:                 put_io_for_block(dmg, io->v_block, io4b);
1621:                 io4b = NULL;        /* to prevent deallocation before ret */
1622:         } else {
1623:                 BUG_ON(list_empty(&extant_io4b->pending_io));
1624:                 /* unchain from io4b->pending_io, unnecessary op */
1625:                 list_del(&io->list);
1626:                 io->parent = extant_io4b;
1627: 
1628:                 if (extant_io4b->rw_cnt > 0) {
1629:                         /* there is concurrent IO on this block */
1630: #ifdef DROP_WRITE_WRITE_CLASH_OPTIMIZATION
1631:                         if (io->rw == WRITE) {
1632:                                 if (test_and_set_bit(WRITE_CLASH_IO_FOR_BLOCK,
1633:                                                      &io4b->flags)) {
1634:                                         ++stats->ww_clash;
1635:                                         spin_unlock_irqrestore(&dmg->lock,
1636:                                                                flags);
1637:                                         if (io->page != NULL) {
1638:                                                 free_page((unsigned long)
1639:                                                           io->page);
1640:                                                 io->page = NULL;
1641:                                         }
1642:                                         goto out_without_submitting_io;
1643:                                 }
1644:                         }
1645: #endif
1646:                         ++stats->rw_clash;
1647:                         ++extant_io4b->rw_cnt;
1648:                         list_add_tail(&io->list, &extant_io4b->pending_io);
1649:                 } else {
1650:                         /* the gc is running on this block */
1651:                         BUG_ON(extant_io4b->rw_cnt == 0); // or != -1
1652:                         ++stats->rw_gc_clash;
1653:                         dispatch_io_now = 0;
1654:                         list_add_tail(&io->list, &extant_io4b->deferred_io);
1655:                 }
1656:         }
1657:         if (dispatch_io_now && io->rw == WRITE) {
1658:                /* Unlike DEFINE_WAIT, DECLARE_WAITQUEUE uses
1659:                 * default_wake_function instead of
1660:                 * autoremove_wake_function to wake up the task. The
1661:                 * former will NOT remove the woken task from the
1662:                 * wait_queue_head_t whereas the latter will. We don't
1663:                 * want the task removed. */
1664:                 DECLARE_WAITQUEUE(__wait, current);
1665:                 if (!__no_available_blocks(dmg)) {
1666:                         goto fastpath_claim_block_for_writing;
1667:                 }
1668:                 // If there's no available space for this WRITE
1669:                 // request, block the underlying task. Note that we
1670:                 // need not force-schedule the gc to run at this
1671:                 // point, since it most likely is already
                // running. Worst case scenario, the gc will be
1673:                 // delayed-scheduled by the timer.
1674:                 __add_wait_queue(&dmg->no_free_space_waitqueue, &__wait);
1675:                 for (;;) {
1676:                         __set_current_state(TASK_UNINTERRUPTIBLE);
1677: 
1678:                         if (!__no_available_blocks(dmg)) {
1679:                                 break;
1680:                         }
1681:                         BUG_ON(in_interrupt());
1682:                         spin_unlock_irqrestore(&dmg->lock, flags);
1683:                         schedule();
1684:                         // at this point, current is in TASK_RUNNING
1685:                         spin_lock_irqsave(&dmg->lock, flags);
1686:                 }
1687:                 __set_current_state(TASK_RUNNING);
1688:                 __remove_wait_queue(&dmg->no_free_space_waitqueue, &__wait);
1689: fastpath_claim_block_for_writing:
1690:                 io->l_block = __claim_next_free_block(dmg);
1691:                 dmg->r_map[io->l_block] = io->v_block;
1692:                 atomic_inc(&seg_for_sector(
1693:                     dmg, block_to_sector(io->l_block))->pending_writes);
1694:         }
1695:         spin_unlock_irqrestore(&dmg->lock, flags);
1696: 
1697:         if (io4b != NULL) {
1698:                 mempool_free(io4b, io_for_block_mempool);
1699:         }
1700:         if (dispatch_io_now) {
1701:                 dm_dispatch_io_bio(io);
1702:         }
1703: }
1704: 
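/* Entry point for regular read/write bios. Sub-block (unaligned)
 * requests are bounced through a private page: sub-block WRITEs start
 * as the READ phase of a read-modify-write cycle, while sub-block
 * READs read the whole block and copy out the requested bytes.
 * Block-aligned requests are dispatched with the bio's own data
 * direction. */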
1705: static int map_rw(struct dm_gecko *dmg, struct bio *bio)
1706: {
1707:         struct io_job *io = mempool_alloc(io_job_mempool, GFP_NOIO);
1708: 
1709:         io->bio = bio;
1710:         io->dmg = dmg;
1711:         io->page = NULL;
1712: 
1713:         if (!bio_at_block_boundary(bio)) {
1714:                 struct dm_gecko_stats *stats;
1715:                 /* if not aligned at page boundary, must be less than
1716:                  * a page worth of data */
1717:                 BUG_ON(bio->bi_size >= GECKO_BLOCK_SIZE);
1718: 
1719:                 io->page = (void *)__get_free_page(GFP_NOIO);
1720:                 if (io->page == NULL) {
1721:                         mempool_free(io, io_job_mempool);
1722:                         bio_endio(bio, -ENOMEM);
1723:                         goto out_ret;
1724:                 }
1725:                 get_cpu();
1726:                 stats = this_cpu_ptr(dmg->stats);
1727:                 if (bio_data_dir(bio) == READ) {
1728:                         ++stats->subblock_reads;
1729:                 } else {
1730:                         ++stats->subblock_writes;
1731:                 }
1732:                 put_cpu();
1733: 
                io->rw = READ;  /* read-modify-write cycle, read first */
1735: 
1736:                 DPRINTK("%s request unaligned, sector(%llu) : size(%llu)",
1737:                         (bio_data_dir(bio) == READ) ? "READ" : "WRITE",
1738:                         (unsigned long long)bio->bi_sector,
1739:                         (unsigned long long)bio->bi_size);
1740:         } else {
1741:                 /* if aligned at page boundary, must be single block
1742:                  * worth of data */
1743:                 BUG_ON(bio->bi_size != GECKO_BLOCK_SIZE);
1744:                 io->rw = bio_data_dir(bio);
1745:         }
1746: 
1747:         io->v_block = sector_to_block(bio->bi_sector);
1748:         // the block must fit in the range
1749:         BUG_ON(is_block_free_or_invalid(io->v_block, dmg));
1750:         atomic_inc(&dmg->total_jobs);
1751: 
1752:         map_rw_io_job(io);
1753: out_ret:
1754:         return DM_MAPIO_SUBMITTED;
1755: }
1756: 
1757: /* TRIMs are advisory, do not issue them when there are other pending
1758:  * read/write or gc relocation/cleaning on the target block. Further,
1759:  * they are not deferred */
1760: static int map_discard(struct dm_gecko *dmg, struct bio *bio)
1761: {
1762:         unsigned long flags;
1763:         struct dm_gecko_stats *stats;
1764:         int freed_blocks = 0;
1765:         sector_t v_block = sector_to_block(bio->bi_sector);
1766: 
1767:         /* never discard block 0 which holds the superblock */
1768:         BUG_ON(v_block == 0);
1769:         spin_lock_irqsave(&dmg->lock, flags);
1770:         /* preemption is disabled under spinlock */
1771:         stats = this_cpu_ptr(dmg->stats);
1772: 
1773:         if (get_io_for_block(dmg, v_block) != NULL) {
1774:                 ++stats->dropped_discards;
1775:         } else {
1776:                 u32 l_block = dmg->d_map[v_block];
1777: 
1778:                 BUG_ON(is_block_invalid(l_block, dmg));
1779:                 if (is_block_marked_free(l_block, dmg)) {
1780:                         WARN(1, DM_GECKO_PREFIX "trim on free block!\n");
1781:                 } else {
1782:                         BUG_ON(v_block != dmg->r_map[l_block]);
1783: 
1784:                         dmg->r_map[l_block] = mark_block_free(dmg);
1785:                         dmg->d_map[v_block] = mark_block_free(dmg);
1786: 
1787:                         ++stats->discards;
1788:                         ++dmg->free_blocks;
1789:                         freed_blocks = __ffwd_tail(dmg);
1790:                 }
1791:         }
1792:         spin_unlock_irqrestore(&dmg->lock, flags);
1793: 
1794:         if (freed_blocks > 0) {
1795:                 wake_up_free_space_available(dmg);
1796:         }
1797:         /* discards are not issued since we have HDDs not SSDs */
1798:         bio_endio(bio, 0);
1799:         return DM_MAPIO_SUBMITTED;
1800: }
1801: 
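/* The device-mapper map function. Empty barriers / flush requests are
 * remapped onto a physical device backing the current log head (one
 * flush request per stripe), discards are handled by map_discard, and
 * everything else goes through map_rw. All map operations take the
 * metadata_sync_sema read lock so that metadata sync can quiesce the
 * target. */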
1802: static int gecko_map(struct dm_target *ti, struct bio *bio,
1803:                      union map_info *map_context)
1804: {
1805:         struct dm_gecko *dmg = (struct dm_gecko *)ti->private;
1806:         int ret = DM_MAPIO_REQUEUE;
1807: 
1808:         down_read(&dmg->metadata_sync_sema);
1809: 
1810: #if LINUX_VERSION_CODE <= KERNEL_VERSION(2, 6, 36)
1811:         if (bio_empty_barrier(bio)) {
1812: #else
1813:         if (bio->bi_rw & REQ_FLUSH) {
1814: #endif
1815:                 struct dm_io_region where;
1816:                 struct dm_gecko_stats *stats;
1817: #if LINUX_VERSION_CODE <= KERNEL_VERSION(2, 6, 35)
1818:                 unsigned target_req_nr = map_context->flush_request;
1819: #else
1820:                 unsigned target_req_nr = map_context->target_request_nr;
1821: #endif
1822:                 get_cpu();
1823:                 stats = this_cpu_ptr(dmg->stats);
1824:                 ++stats->empty_barriers;
1825:                 put_cpu();
1826:                 /* TODO: fix the case when the dmg->head just advanced
1827:                  * across disks rendering the barrier ineffective on
                 * the intended disks. When dmg->head linear index
1829:                  * advances to a new disk, the previous disk's mirrors
1830:                  * are to be put in a lower power state. Currently, we
1831:                  * issue a synchronous barrier so that all in-progress
1832:                  * writes will complete before continuing, also fixing
1833:                  * the case. */
1834:                 if (dmg->disk_map.layout == raid0) {
1835:                         linear_to_phy_which(dmg,
1836:                                             block_to_sector(dmg->head) +
1837:                                             target_req_nr,
1838:                                             0, &where);
1839:                 } else {
1840:                         linear_to_phy_which(dmg, block_to_sector(dmg->head),
1841:                                             target_req_nr, &where);
1842:                 }
1843:                 bio->bi_bdev = where.bdev;
1844:                 /* the empty barriers do not indicate which
1845:                  * sectors:size are sync'ed */
1846:                 DPRINTK("bio_empty_barrier device(%u:%u) (%llu:%llu) (%u)",
1847:                         MAJOR(bio->bi_bdev->bd_dev),
1848:                         MINOR(bio->bi_bdev->bd_dev),
1849:                         (unsigned long long)sector_to_block(bio->bi_sector),
1850:                         (unsigned long long)to_sector(bio->bi_size),
1851:                         target_req_nr);
1852: 
1853:                 ret = DM_MAPIO_REMAPPED;
1854:                 goto out;
1855:         }
1856:         DPRINTK("%s request for sector %llu, %u bytes",
1857:                 bio_rw(bio) == WRITE ? "WRITE" :
1858:                 (bio_rw(bio) == READA ? "READA" : "READ"),
1859:                 (unsigned long long)bio->bi_sector, bio->bi_size);
1860: 
1861:         if (bio->bi_rw & REQ_DISCARD) {
1862:                 ret = map_discard(dmg, bio);
1863:         } else {
1864:                 ret = map_rw(dmg, bio);
1865:         }
1866: out:
1867:         up_read(&dmg->metadata_sync_sema);
1868:         return ret;
1869: }
1870: 
1871: static void dm_gecko_put_devices(struct dm_target *ti, struct dm_gecko *dmg)
1872: {
1873:         struct list_head *dm_dev_segs = &dmg->disk_map.dm_dev_segs;
1874: 
1875:         while (!list_empty(dm_dev_segs)) {
1876:                 int i;
1877:                 struct dm_dev_seg *seg =
1878:                     list_entry(dm_dev_segs->next, struct dm_dev_seg, list);
1879: 
1880:                 list_del(&seg->list);
1881:                 for (i = 0; i < dmg->disk_map.stripes; i++) {
1882:                         /* initializing the disk_map NULLifies
1883:                          * ->dev[i] before dm_get_device */
1884:                          if (seg->dev[i]) {
1885:                                 dm_put_device(ti, seg->dev[i]);
1886:                          }
1887:                 }
1888:                 kfree(seg);
1889:         }
1890: }
1891: 
1892: /* Schedule power adjustment for all mirror stripes except for
1893:  * ->head_seg or ->tail_seg if gc is independent from read.
 * WARNING: since the power adjustment happens asynchronously
 * on the workqueue, make sure the workqueue does not disappear
 * from under it.  */
1897: static void sched_delayed_power_adjustment_for_segments(struct dm_gecko *dmg)
1898: {
        struct dm_dev_seg *seg;
1900:         if (test_bit(DM_GECKO_READ_TPUT, &dmg->flags)) {
1901:                 return;
1902:         }
1903:         list_for_each_entry(seg, &dmg->disk_map.dm_dev_segs, list) {
1904:                 sched_delayed_power_adjustment_for_segment(seg,
1905:                                                            dmg->low_pow_state);
1906:         }
1907: }
1908: 
1909: // This function is synchronous.
1910: static void power_up_segment(struct dm_dev_seg *seg) {
1911:         struct dm_gecko *dmg = seg->ctxt;
1912:         int i;
1913:         for (i = 0; i < dmg->disk_map.stripes; i++) {
1914:                 set_drive_power(seg->dev[i]->bdev, active);
1915:         }
1916: }
1917: 
1918: // This function is synchronous.
1919: static void power_up_all_segments(struct dm_gecko *dmg)
1920: {
1921:         struct dm_dev_seg *seg;
1922:         list_for_each_entry(seg, &dmg->disk_map.dm_dev_segs, list) {
1923:                 power_up_segment(seg);
1924:         }
1925: }
1926: 
1927: static struct dm_dev_seg *seg_alloc_and_init(gfp_t flags, struct dm_gecko *dmg)
1928: {
1929:         struct dm_dev_seg *seg = kmalloc(sizeof(*seg), flags);
1930:         int j;
1931: 
1932:         if (!seg) {
1933:                 return NULL;
1934:         }
1935:         seg->ctxt = dmg;
1936:         seg->cur_pow_state = seg->next_pow_state = unspecified;
1937:         seg->access_seq_in_log = 0;
1938:         atomic_set(&seg->pending_writes, 0);
1939:         INIT_WORK(&seg->work, run_dm_dev_seg);
1940:         for (j = 0; j < dmg->disk_map.stripes; j++) {
                /* ensure error handling works */
1942:                 seg->dev[j] = NULL;
1943:         }
1944:         return seg;
1945: }
1946: 
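/* Load the persistent metadata from dmg->meta_filename: one page
 * holding the dm_gecko_persistent_metadata header followed by the
 * device names, then the direct map and the reverse map (each padded
 * to a multiple of PAGE_SIZE). The devices named in the header are
 * re-acquired via dm_get_device and the disk map is rebuilt. */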
1947: static int load_dm_gecko(struct dm_target *ti, struct dm_gecko *dmg)
1948: {
1949:         struct dm_gecko_persistent_metadata *dmg_meta;
1950:         struct dm_gecko_dev *dmg_devs;
1951:         size_t i, map_size;
1952:         int sz, err = 0, disk_cnt;
1953:         struct file *file;
1954:         loff_t pos = 0;
1955:         mm_segment_t old_fs = get_fs();
1956: 
1957:         char *page = (char *)__get_free_page(GFP_KERNEL);
1958:         if (!page) {
1959:                 return -ENOMEM;
1960:         }
1961:         set_fs(KERNEL_DS);
1962: 
1963:         file = filp_open(dmg->meta_filename, O_LARGEFILE | O_RDONLY, 0);
1964:         if (IS_ERR(file)) {
1965:                 printk(DM_GECKO_PREFIX "open %s\n", dmg->meta_filename);
1966:                 err = PTR_ERR(file);
1967:                 goto out;
1968:         }
1969: 
1970:         sz = vfs_read(file, page, PAGE_SIZE, &pos);
1971:         if (sz != PAGE_SIZE) {
1972:                 err = (sz < 0) ? sz : -EIO;
1973:                 printk(DM_GECKO_PREFIX "read metadata %s: %d\n",
1974:                        dmg->meta_filename, err);
1975:                 goto out_close;
1976:         }
1977: 
1978:         dmg_meta = (struct dm_gecko_persistent_metadata *)page;
1979:         if (dmg_meta->magic != DM_GECKO_META_MAGIC) {
1980:                 printk(DM_GECKO_PREFIX "magic number error, endianness?\n");
1981:                 err = -EINVAL;
1982:                 goto out_close;
1983:         }
1984: 
1985:         dmg->incarnation = dmg_meta->incarnation + 1;
1986:         dmg->size = dmg_meta->size;
1987:         dmg->tail = dmg_meta->tail;
1988:         dmg->head = dmg_meta->head;
1989:         dmg->available_blocks = dmg_meta->available_blocks;
1990:         dmg->free_blocks = dmg_meta->free_blocks;
1991:         dmg->disk_map.layout = dmg_meta->layout;
1992:         dmg->disk_map.stripes = dmg_meta->stripes;
1993:         BUG_ON(dmg->disk_map.stripes > DM_GECKO_MAX_STRIPES);
1994:         dmg->flags = dmg_meta->flags;
1995:         dmg->max_gc_req_in_progress = dmg_meta->max_gc_req_in_progress;
1996:         dmg->gc_ctrl = dmg_meta->gc_ctrl;
1997:         dmg->low_pow_state = dmg_meta->low_pow_state;
1998: 
1999:         disk_cnt = dmg_meta->disk_map_cnt * dmg_meta->stripes;
        if (disk_cnt * sizeof(*dmg_devs) > PAGE_SIZE - sizeof(*dmg_meta)) {
2001:                 printk(DM_GECKO_PREFIX "too many disks\n");
2002:                 err = -EINVAL;
2003:                 goto out_close;
2004:         }
2005: 
2006:         dmg_devs = (struct dm_gecko_dev *) (page + sizeof(*dmg_meta));
2007:         INIT_LIST_HEAD(&dmg->disk_map.dm_dev_segs);
2008:         dmg->disk_map.cnt = 0;
2009:         dmg->disk_map.len = 0;
2010:         for (i = 0; i < dmg_meta->disk_map_cnt; i++) {
2011:                 int j;
2012:                 sector_t stripes_len = 0;
2013:                 struct dm_dev_seg *seg = seg_alloc_and_init(GFP_KERNEL, dmg);
2014:                 if (!seg) {
2015:                         printk(DM_GECKO_PREFIX "kmalloc dm_dev_seg\n");
2016:                         err = -ENOMEM;
2017:                         goto out_err_1;
2018:                 }
2019:                 seg->idx = i;
2020:                 list_add_tail(&seg->list, &dmg->disk_map.dm_dev_segs);
2021: 
2022:                 for (j = 0; j < dmg_meta->stripes; j++) {
2023:                         struct dm_gecko_dev *dmg_dev =
2024:                             &dmg_devs[i * dmg_meta->stripes + j];
2025:                         err = dm_get_device(ti,
2026:                                             dmg_dev->name,
2027:                                             dm_table_get_mode(ti->table),
2028:                                             &seg->dev[j]);
2029:                         if (err) {
2030:                                 printk(DM_GECKO_PREFIX
2031:                                        "device lookup failed\n");
2032:                                 goto out_err_1;
2033:                         }
2034:                         if (seg->dev[0]->bdev->bd_inode->i_size !=
2035:                             seg->dev[j]->bdev->bd_inode->i_size) {
2036:                                 printk(DM_GECKO_PREFIX
2037:                                        "stripes must match in size "
2038:                                        "(%llu != %llu)\n",
2039:                                        seg->dev[0]->bdev->bd_inode->i_size,
2040:                                        seg->dev[j]->bdev->bd_inode->i_size);
2041:                                 err = -EINVAL;
2042:                                 goto out_err_1;
2043:                         }
2044:                         stripes_len += seg->dev[j]->bdev->bd_inode->i_size;
2045:                 }
2046: 
2047:                 seg->start = dmg->disk_map.len;
2048:                 seg->len = (dmg->disk_map.layout == raid0) ?
2049:                     (stripes_len >> SECTOR_SHIFT) :
2050:                     (seg->dev[0]->bdev->bd_inode->i_size >> SECTOR_SHIFT);
2051:                 dmg->disk_map.len += seg->len;
2052: 
2053:                 ++dmg->disk_map.cnt;
2054:         }
2055:         if (dmg->disk_map.len != ti->len) {
2056:                 printk(DM_GECKO_PREFIX
2057:                        "disk_map length != dm_target length\n");
2058:                 err = -EINVAL;
2059:                 goto out_err_1;
2060:         }
2061:         BUG_ON(dmg->size != sector_to_block(dmg->disk_map.len));
2062: 
2063:         /* allocate the maps */
2064:         map_size = PAGE_ALIGN(sizeof(*dmg->d_map) * dmg->size);
2065:         dmg->d_map = vmalloc(map_size);
2066:         if (!dmg->d_map) {
2067:                 printk(DM_GECKO_PREFIX "vmalloc ->d_map failed\n");
2068:                 err = -ENOMEM;
2069:                 goto out_err_1;
2070:         }
2071:         /* same size as direct map */
2072:         dmg->r_map = vmalloc(map_size);
2073:         if (!dmg->r_map) {
2074:                 printk(DM_GECKO_PREFIX "vmalloc ->r_map failed\n");
2075:                 err = -ENOMEM;
2076:                 goto out_err_2;
2077:         }
2078: 
2079:         /* read the maps (the maps are multiple of PAGE_SIZE for convenience) */
2080:         for (i = 0; i < map_size; i += PAGE_SIZE) {
2081:                 char *dest = &((char *)dmg->d_map)[i];
2082: 
2083:                 sz = vfs_read(file, dest, PAGE_SIZE, &pos);
2084:                 if (sz != PAGE_SIZE) {
2085:                         err = (sz < 0) ? sz : -EIO;
2086:                         printk(DM_GECKO_PREFIX "vfs_read ->d_map\n");
2087:                         goto out_err_3;
2088:                 }
2089:         }
2090:         for (i = 0; i < map_size; i += PAGE_SIZE) {
2091:                 char *dest = &((char *)dmg->r_map)[i];
2092: 
2093:                 sz = vfs_read(file, dest, PAGE_SIZE, &pos);
2094:                 if (sz != PAGE_SIZE) {
2095:                         err = (sz < 0) ? sz : -EIO;
2096:                         printk(DM_GECKO_PREFIX "vfs_read ->r_map\n");
2097:                         goto out_err_3;
2098:                 }
2099:         }
2100: 
2101: out_close:
2102:         filp_close(file, current->files);
2103: out:
2104:         set_fs(old_fs);
2105:         free_page((unsigned long)page);
2106:         return err;
2107: 
2108: out_err_1:
2109:         dm_gecko_put_devices(ti, dmg);
2110:         goto out_close;
2111: 
2112: out_err_2:
        vfree(dmg->d_map);
2114:         goto out_err_1;
2115: 
2116: out_err_3:
        vfree(dmg->r_map);
2118:         goto out_err_2;
2119: }
2120: 
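/* Debugging/benchmarking helper: fabricates a metadata layout of
 * alternating runs of `data_blocks' written blocks and `free_blocks'
 * holes starting at the current tail, as if the log had been written
 * and partially trimmed. The gc is force-stopped first so the
 * artificial layout is not immediately cleaned; see the commented-out
 * call in gecko_ctr. */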
2121: void create_artificial_metadata_maps(struct dm_gecko *dmg,
2122:                                      sector_t total_blocks,
2123:                                      int data_blocks,
2124:                                      int free_blocks) {
2125:         u32 vblock = 0;
2126:         enum fake_writing {
2127:                 FAKE_WR_BLOCKS,
2128:                 FAKE_WR_HOLES,
2129:         };
2130:         enum fake_writing state = FAKE_WR_BLOCKS;  // start by writing blocks
2131:         int blocks_written = 0;
2132:         int holes_written = 0;
2133:         sector_t size = dmg->tail + total_blocks;
2134:         BUG_ON(size > 0xffffffff);
2135:         BUG_ON(size > dmg->size);
2136:         set_bit(DM_GECKO_GC_FORCE_STOP, &dmg->flags);  // force-stop the gc
2137:         for (dmg->head = dmg->tail; dmg->head < size; ++dmg->head) {
2138:                 // The entire map contains `holes'.
2139:                 BUG_ON(dmg->r_map[dmg->head] != mark_block_free(dmg));
2140:                 switch (state) {
2141:                 case FAKE_WR_BLOCKS:
2142:                         dmg->r_map[dmg->head] = vblock;
2143:                         dmg->d_map[vblock] = dmg->head;
2144:                         ++vblock;
2145:                         --dmg->persistent_available_blocks;
2146:                         --dmg->available_blocks;
2147:                         --dmg->free_blocks;
2148:                         if (++blocks_written >= data_blocks) {
2149:                                 holes_written = 0;
2150:                                 state = FAKE_WR_HOLES;
2151:                         }
2152:                         break;
2153:                 case FAKE_WR_HOLES:
2154:                         if (++holes_written >= free_blocks) {
2155:                                 blocks_written = 0;
2156:                                 state = FAKE_WR_BLOCKS;
2157:                         }
2158:                         break;
2159:                 default:
2160:                         printk(DM_GECKO_PREFIX
2161:                                "invalid artificial metadata map write state\n");
2162:                         BUG_ON(1);
2163:                         break;
2164:                 }
2165:         }
2166: }
2167: 
2168: /*
2169:  * insmod dm-gecko_mod.ko <persistent (true=1 | false)>
 * <metadata-file> [ <layout ("linear" | "raid1" | "raid0")>
 * [<# of stripes>] <# of devices> <dev_path>+ ]?
2172:  *
2173:  * Note that a "linear" layout is equivalent to a "raid0" or a "raid1"
2174:  * layout with a single stripe (it exists for historical reasons---the
2175:  * first one to be developed).
2176:  *
 * If persistent, the layout, number of stripes, and device paths are
 * loaded from the metadata file, so the corresponding arguments are
 * irrelevant; the metadata is saved back persistently when the
 * target is destroyed.  The metadata should also be synchronized to
 * persistent storage periodically and perhaps only dirty mappings
 * should be updated (i.e. as results of reads). */
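
/* An illustrative (untested) example, assuming the target type is
 * registered as "gecko" and using hypothetical device paths and
 * metadata file: a non-persistent raid1 log with 2-way mirroring,
 * i.e. two segments of two mirrored disks each, could be created via
 * dmsetup rather than module parameters:
 *
 *   dmsetup create gecko0 --table "0 <num_sectors> gecko 0 \
 *       /var/lib/gecko.meta raid1 2 4 /dev/sdb /dev/sdc /dev/sdd /dev/sde"
 *
 * gecko_ctr() then sees: persistent=0, the metadata file, the layout,
 * the number of stripes, the number of devices, and the device paths.
 * <num_sectors> must equal the combined disk_map length computed
 * below (for raid1, the sum of the per-segment lengths, i.e. one
 * stripe's worth of sectors per segment). */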
2182: static int gecko_ctr(struct dm_target *ti, unsigned int argc, char *argv[])
2183: {
2184:         int err = -ENOMEM, persistent, i, dm_devs, arg = 0;
2185:         char *end;
2186:         u32 mapidx;
2187: 
2188:         struct dm_gecko *dmg = kmalloc(sizeof(*dmg), GFP_KERNEL);
2189:         if (!dmg) {
2190:                 ti->error = DM_GECKO_PREFIX "unable to allocate gecko context";
2191:                 goto out1;
2192:         }
2193:         memset(dmg, 0, sizeof(*dmg));        /* zeros out the stats as well */
2194:         // TODO: agree on a set of default startup flags
2195:         set_bit(DM_GECKO_READ_TPUT, &dmg->flags);
2196:         spin_lock_init(&dmg->lock);
2197:         init_waitqueue_head(&dmg->jobs_pending_waitqueue);
2198:         init_waitqueue_head(&dmg->no_free_space_waitqueue);
2199:         init_rwsem(&dmg->metadata_sync_sema);
2200:         atomic_set(&dmg->total_jobs, 0);
2201:         hrtimer_init(&dmg->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2202:         dmg->timer.function = fire_gc_timer;
2203:         INIT_WORK(&dmg->gc_work, try_sched_gc);
2204:         atomic_set(&dmg->gc_work_scheduled_by_timer, 0);
2205:         atomic_set(&dmg->timer_active, 1);
2206:         INIT_WORK(&dmg->sync_metadata_work, sync_metadata);
2207:         dmg->gc_req_in_progress = 0;
2208:         dmg->max_gc_req_in_progress = GC_CONCURRENT_REQ;
2209:         dmg->low_pow_state = DEFAULT_LOW_POW_STATE;
2210:         dmg->incarnation = 1;
2211: 
2212:         if (!(dmg->stats = alloc_percpu(struct dm_gecko_stats))) {
2213:                 ti->error = DM_GECKO_PREFIX "unable to alloc_percpu stats";
2214:                 printk("%s\n", ti->error);
2215:                 goto out2;
2216:         }
2217: 
2218: #if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 0, 0)
2219:         dmg->kcopyd_client = dmg_kcopyd_client_create();
2220:         err = IS_ERR(dmg->kcopyd_client) ? PTR_ERR(dmg->kcopyd_client) : 0;
2221: #else
2222:         err = dmg_kcopyd_client_create(DM_GECKO_GC_COPY_PAGES,
2223:                                        &dmg->kcopyd_client);
2224: #endif
2225:         if (err) {
2226:                 ti->error =
2227:                     DM_GECKO_PREFIX "unable to register as a kcopyd client";
2228:                 printk("%s\n", ti->error);
2229:                 goto out3;
2230:         }
2231: 
2232:         dmg->io_client = dm_io_client_create(DM_GECKO_GC_COPY_PAGES);
2233:         if (IS_ERR(dmg->io_client)) {
2234:                 ti->error =
2235:                     DM_GECKO_PREFIX "unable to register as an io client";
2236:                 err = PTR_ERR(dmg->io_client);
2237:                 printk("%s, errno=%d\n", ti->error, err);
2238:                 goto out4;
2239:         }
2240: 
2241:         /* parse args */
2242:         persistent = simple_strtol(argv[arg++], &end, 10);
2243:         if (*end) {
2244:                 ti->error = DM_GECKO_PREFIX "invalid persistence arg";
2245:                 printk("%s\n", ti->error);
2246:                 err = -EINVAL;
2247:                 goto out5;
2248:         }
2249: 
2250:         dmg->meta_filename = kstrdup(argv[arg++], GFP_KERNEL);
2251:         if (!dmg->meta_filename) {
2252:                 ti->error = DM_GECKO_PREFIX "unable to kstrdup meta-filename";
2253:                 printk("%s\n", ti->error);
2254:                 err = -ENOMEM;
2255:                 goto out5;
2256:         }
2257: 
2258:         if (persistent) {
2259:                 err = load_dm_gecko(ti, dmg);
2260:                 if (err) {
2261:                         ti->error =
2262:                             DM_GECKO_PREFIX
2263:                             "unable to load gecko from meta-file";
2264:                         printk("%s\n", ti->error);
2265:                         goto out5;
2266:                 }
2267:                 goto dm_maps_ready;
2268:         }
2269: 
2270:         if (!persistent && argc < 5) {
2271:                 ti->error = DM_GECKO_PREFIX "insufficient arguments";
2272:                 printk("%s\n", ti->error);
2273:                 err = -EINVAL;
2274:                 goto out5;
2275:         }
2276: 
2277:         if (strcmp(argv[arg], "linear") == 0) {
2278:                 dmg->disk_map.layout = linear;
2279:         } else if (strcmp(argv[arg], "raid1") == 0) {
2280:                 dmg->disk_map.layout = raid1;
2281:         } else if (strcmp(argv[arg], "raid0") == 0) {
2282:                 dmg->disk_map.layout = raid0;
2283:         } else {
2284:                 ti->error = DM_GECKO_PREFIX "invalid layout";
2285:                 printk("%s\n", ti->error);
2286:                 err = -EINVAL;
2287:                 goto out5;
2288:         }
2289:         ++arg;
2290:         if (dmg->disk_map.layout == raid1 || dmg->disk_map.layout == raid0) {
2291:                 dmg->disk_map.stripes = simple_strtoul(argv[arg++], &end, 10);
2292:                 if (*end || dmg->disk_map.stripes > DM_GECKO_MAX_STRIPES) {
2293:                         ti->error = DM_GECKO_PREFIX "invalid number of stripes";
2294:                         printk("%s\n", ti->error);
2295:                         err = -EINVAL;
2296:                         goto out5;
2297:                 }
2298:         } else {
2299:                 dmg->disk_map.stripes = 1;
2300:         }
2301:         printk(DM_GECKO_PREFIX "# of stripes: %d\n", dmg->disk_map.stripes);
2302:         dm_devs = simple_strtoul(argv[arg++], &end, 10);
2303:         if (!(*end)) {
2304:           printk(DM_GECKO_PREFIX "# of devices: %d\n", dm_devs);
2305:         }
2306:         if (!dm_devs || *end || dm_devs != (argc - arg) ||
2307:             ((dmg->disk_map.layout == raid1 || dmg->disk_map.layout == raid0) &&
2308:              (dm_devs % dmg->disk_map.stripes != 0))) {
2309:                 ti->error = DM_GECKO_PREFIX "invalid number of devices";
2310:                 printk("%s\n", ti->error);
2311:                 err = -EINVAL;
2312:                 goto out5;
2313:         }
2314: 
2315:         INIT_LIST_HEAD(&dmg->disk_map.dm_dev_segs);
2316:         dmg->disk_map.cnt = dm_devs / dmg->disk_map.stripes;
2317:         dmg->disk_map.len = 0;
2318:         for (i = 0; i < dmg->disk_map.cnt; i++) {
2319:                 int j;
2320:                 sector_t stripes_len = 0;
2321: 
2322:                 struct dm_dev_seg *seg = seg_alloc_and_init(GFP_KERNEL, dmg);
2323:                 if (!seg) {
2324:                         ti->error = DM_GECKO_PREFIX "kmalloc dm_dev_seg";
2325:                         printk("%s\n", ti->error);
2326:                         err = -ENOMEM;
2327:                         goto out6;
2328:                 }
2329:                 seg->idx = i;
2330:                 list_add_tail(&seg->list, &dmg->disk_map.dm_dev_segs);
2331: 
2332:                 for (j = 0; j < dmg->disk_map.stripes; j++) {
2333:                         err = dm_get_device(
2334:                             ti,
2335:                             argv[arg + (i * dmg->disk_map.stripes + j)],
2336:                             dm_table_get_mode(ti->table),
2337:                             &seg->dev[j]);
2338:                         if (err) {
2339:                                 ti->error =
2340:                                     DM_GECKO_PREFIX "device lookup failed";
2341:                                 printk("%s\n", ti->error);
2342:                                 goto out6;
2343:                         }
2344:                         // TODO(tudorm): take the min size
2345:                         if (seg->dev[0]->bdev->bd_inode->i_size !=
2346:                             seg->dev[j]->bdev->bd_inode->i_size) {
2347:                                 ti->error =
2348:                                     DM_GECKO_PREFIX
2349:                                     "stripes must match in size";
2350:                                 printk("%s (%llu != %llu)\n", ti->error,
2351:                                        seg->dev[0]->bdev->bd_inode->i_size,
2352:                                        seg->dev[j]->bdev->bd_inode->i_size);
2353:                                 err = -EINVAL;
2354:                                 goto out6;
2355:                         }
2356:                         stripes_len += seg->dev[j]->bdev->bd_inode->i_size;
2357:                         printk(DM_GECKO_PREFIX "added disk for stripe %d:%d\n",
2358:                                i, j);
2359:                 }
2360:                 seg->start = dmg->disk_map.len;
2361:                 seg->len = (dmg->disk_map.layout == raid0) ?
2362:                   (stripes_len >> SECTOR_SHIFT) :
2363:                   (seg->dev[0]->bdev->bd_inode->i_size >> SECTOR_SHIFT);
2364:                 printk(DM_GECKO_PREFIX "sector %d start=%ld and len=%ld\n",
2365:                         seg->idx, seg->start, seg->len);
2366:                 dmg->disk_map.len += seg->len;
2367:         }
2368:         if (dmg->disk_map.len != ti->len) {
2369:                 ti->error =
2370:                     DM_GECKO_PREFIX "disk_map length != dm_target length";
2371:                 printk("%s\n", ti->error);
2372:                 err = -EINVAL;
2373:                 goto out6;
2374:         }
2375: 
2376:         if (sector_to_block(dmg->disk_map.len) > 0xffffffff-1) {
2377:                 ti->error = DM_GECKO_PREFIX "unsupported size (too large)";
2378:                 printk("%s \n", ti->error);
2379:                 err = -EINVAL;
2380:                 goto out6;
2381:         }
2382:         dmg->size = sector_to_block(dmg->disk_map.len);
2383:         /* (dmg->size-1) for circular buffer logic: one slot wasted to
2384:          * distinguish between full and empty circular buffer. */
2385:         dmg->available_blocks = dmg->free_blocks = dmg->size-1;
2386:         dmg->gc_ctrl.low_watermark = GC_DEFAULT_LOW_WATERMARK;
2387:         dmg->gc_ctrl.high_watermark = GC_DEFAULT_HIGH_WATERMARK;
2388: 
2389:         /* Allocate the maps, initialize them, and also initialize the
2390:          * circular pointers. The maps are page aligned and their size
2391:          * is also a multiple of PAGE_SIZE to simplify the potentially
2392:          * selective writing of the metadata. */
2393:         dmg->d_map = vmalloc(PAGE_ALIGN(sizeof(*dmg->d_map) * dmg->size));
2394:         if (!dmg->d_map) {
2395:                 ti->error = DM_GECKO_PREFIX "vmalloc ->d_map failed";
2396:                 printk("%s\n", ti->error);
2397:                 err = -ENOMEM;
2398:                 goto out6;
2399:         }
2400:         dmg->r_map = vmalloc(PAGE_ALIGN(sizeof(*dmg->r_map) * dmg->size));
2401:         if (!dmg->r_map) {
2402:                 ti->error = DM_GECKO_PREFIX "vmalloc ->r_map failed";
2403:                 printk("%s\n", ti->error);
2404:                 err = -ENOMEM;
2405:                 goto out7;
2406:         }
2407:         for (mapidx = 0; mapidx < dmg->size; ++mapidx) {
2408:                 dmg->d_map[mapidx] = mark_block_free(dmg);
2409:                 dmg->r_map[mapidx] = mark_block_free(dmg);
2410:         }
2411: 
2412:         dmg->tail = dmg->head = 0;
        // The commented-out block below would advance the initial
        // head/tail so that sustained writes (e.g. at 120MB/s) soon
        // spill into the next disk; it is disabled since it breaks
        // the (fairly small) VM loop devices.
2415: 
2416:         /*
2417: #define CLOSE_TO_MAX_SIZE_OFFSET (120 * 120 * (1024 * 1024 / GECKO_BLOCK_SIZE))
2418:         if (dmg->size < CLOSE_TO_MAX_SIZE_OFFSET) {
2419:                 printk(DM_GECKO_PREFIX "WARNING, discarding overflow math\n");
2420:         } else {
2421:                 dmg->tail = dmg->head = dmg->size - CLOSE_TO_MAX_SIZE_OFFSET;
2422:         }
2423:         */
2424:         /*
2425:         create_artificial_metadata_maps(dmg, dmg->size/2 + (10 * 1024),
2426:                                         //dmg->size/16, 1);
2427:                                         256, 256);
2428:         */
2429: dm_maps_ready:
2430: 
2431:         /* alloc the htable of IO requests in-progress */
2432:         dmg->buckets =
2433:             kmalloc(sizeof(struct list_head) * HASH_TABLE_SIZE, GFP_KERNEL);
2434:         if (!dmg->buckets) {
2435:                 ti->error = DM_GECKO_PREFIX "kmalloc htable failed";
2436:                 printk("%s\n", ti->error);
2437:                 err = -ENOMEM;
2438:                 goto out8;
2439:         }
2440: 
2441:         for (i = 0; i < HASH_TABLE_SIZE; i++)
2442:                 INIT_LIST_HEAD(&dmg->buckets[i]);
2443:         dmg->htable_size = 0;        /* rendered redundant by memset */
2444:         dmg->head_seg = seg_for_sector(dmg, block_to_sector(dmg->head));
2445:         ++dmg->head_seg->access_seq_in_log;
2446:         dmg->tail_seg = seg_for_sector(dmg, block_to_sector(dmg->tail));
2447: 
2448:         ti->split_io = GECKO_SECTORS_PER_BLOCK;  /* size in # of sectors */
2449:         ti->num_flush_requests = dmg->disk_map.stripes;
2450: #if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 36)
2451:         ti->num_discard_requests = dmg->disk_map.stripes;
2452: #endif
2453:         ti->private = dmg;
2454:         sched_delayed_power_adjustment_for_segments(dmg);
2455: 
2456:         // start timer right before returning
2457:         dmg->timer_delay = ktime_set(GECKO_TIMER_PERIOD_SECS,
2458:                                      GECKO_TIMER_PERIOD_NSECS);
2459:         if ((err = hrtimer_start(&dmg->timer,
2460:                                  dmg->timer_delay,
2461:                                  HRTIMER_MODE_REL)) != 0) {
2462:           ti->error = DM_GECKO_PREFIX "hrtimer_start failed";
2463:           printk("%s\n", ti->error);
2464:           goto out8;
2465:         }
2466: 
2467:         printk(DM_GECKO_PREFIX "gecko_ctr done (dm_gecko incarnation %llu).\n",
2468:                dmg->incarnation);
2469:         return 0;
2470: out8:
2471:         vfree(dmg->r_map);
2472: out7:
2473:         vfree(dmg->d_map);
2474: out6:
2475:         dm_gecko_put_devices(ti, dmg);
2476: out5:
2477:         dm_io_client_destroy(dmg->io_client);
2478: out4:
2479:         dmg_kcopyd_client_destroy(dmg->kcopyd_client);
2480: out3:
2481:         free_percpu(dmg->stats);
2482: out2:
2483:         kfree(dmg);
2484: out1:
2485:         return err;
2486: }
2487: 
2488: static long (*sys_rename_wrapper)(const char __user *oldname,
2489:                                   const char __user *newname) = NULL;
2490: 
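/* Persist the metadata in the same layout load_dm_gecko expects: a
 * single page holding the dm_gecko_persistent_metadata header and the
 * "major:minor" names of all devices, followed by the direct and
 * reverse maps (each a multiple of PAGE_SIZE). When a rename syscall
 * wrapper is available, the data is written to a randomly-suffixed
 * temporary file that is then renamed over the real metadata file;
 * otherwise the metadata file is rewritten in place. */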
2491: static int store_dm_gecko(struct dm_gecko *dmg)
2492: {
2493:         struct dm_gecko_persistent_metadata* dmg_meta;
2494:         struct dm_gecko_dev *dmg_devs;
2495:         struct dm_dev_seg *seg;
2496:         size_t i, map_size;
2497:         int sz, err = 0;
2498:         u32 rand_bytes;
2499:         struct file *file;
2500:         loff_t pos = 0;
2501:         char *meta_filename_tmp, *page;
2502:         char *dmg_devs_offset;
2503:         mm_segment_t old_fs = get_fs();
2504: 
2505:         page = (char *)__get_free_page(GFP_KERNEL);
2506:         if (!page)
2507:                 return -ENOMEM;
2508: 
2509:         if (sys_rename_wrapper != NULL) {
2510:                 // the temp filename consists of the original filename
2511:                 // concatenated with the hex value of sizeof(rand_bytes)
2512:                 // random bytes (a nibble is represented by one character).
2513:                 meta_filename_tmp = kmalloc(strlen(dmg->meta_filename) + 1 +
2514:                         sizeof(rand_bytes) * 2, GFP_KERNEL);
2515:                 if (!meta_filename_tmp) {
2516:                         err = -ENOMEM;
2517:                         goto out_free_page;
2518:                 }
2519:                 get_random_bytes(&rand_bytes, sizeof(rand_bytes));
2520:                 sprintf(meta_filename_tmp, "%s%x",
2521:                         dmg->meta_filename, rand_bytes);
2522:         } else {
2523:                 meta_filename_tmp = dmg->meta_filename;
2524:         }
2525:         set_fs(KERNEL_DS);
2526: 
2527:         file = filp_open(meta_filename_tmp,
2528:                          O_LARGEFILE | O_WRONLY | O_CREAT | O_TRUNC, 0644);
        if (IS_ERR(file)) {
                printk(DM_GECKO_PREFIX "open %s\n", dmg->meta_filename);
                err = PTR_ERR(file);
                goto out;
2533:         }

        dmg_meta = (struct dm_gecko_persistent_metadata *)page;

        dmg_meta->incarnation = dmg->incarnation;
        dmg_meta->magic = DM_GECKO_META_MAGIC;
        dmg_meta->size = dmg->size;
        dmg_meta->tail = dmg->tail;
        dmg_meta->head = dmg->head;
        dmg_meta->available_blocks = dmg->available_blocks;
        dmg_meta->free_blocks = dmg->free_blocks;
        dmg_meta->layout = dmg->disk_map.layout;
        dmg_meta->stripes = dmg->disk_map.stripes;
        dmg_meta->disk_map_cnt = dmg->disk_map.cnt;
        dmg_meta->flags = dmg->flags;
        dmg_meta->max_gc_req_in_progress = dmg->max_gc_req_in_progress;
        // Clear the volatile flags.
        clear_bit(DM_GECKO_GC_FORCE_STOP, &dmg_meta->flags);
        clear_bit(DM_GECKO_FINAL_SYNC_METADATA, &dmg_meta->flags);
        clear_bit(DM_GECKO_SYNCING_METADATA, &dmg_meta->flags);
        dmg_meta->gc_ctrl = dmg->gc_ctrl;
        dmg_meta->low_pow_state = dmg->low_pow_state;

        dmg_devs_offset = page + sizeof(*dmg_meta);
        dmg_devs = (struct dm_gecko_dev *)dmg_devs_offset;
        if (dmg_meta->disk_map_cnt * dmg_meta->stripes * sizeof(*dmg_devs) >
            PAGE_SIZE - sizeof(*dmg_meta)) {
                printk(DM_GECKO_PREFIX "metadata too large (too many disks)\n");
                err = -EINVAL;
                goto out_close;
        }

        /* populate the disk map w/ disk device names */
        list_for_each_entry(seg, &dmg->disk_map.dm_dev_segs, list) {
                int i;
                for (i = 0; i < dmg->disk_map.stripes; i++) {
                        dev_t _dev;

                        BUG_ON(seg->dev[i] == NULL);

                        _dev = seg->dev[i]->bdev->bd_dev;
                        sprintf(dmg_devs->name, "%u:%u", MAJOR(_dev),
                                MINOR(_dev));
                        ++dmg_devs;
                }
        }

        BUG_ON(((unsigned long)dmg_devs) - ((unsigned long)dmg_devs_offset)
               > PAGE_SIZE);
        /* Write PAGE_SIZE worth of data, to align subsequent maps */
        sz = vfs_write(file, page, PAGE_SIZE, &pos);
        if (sz != PAGE_SIZE) {
                err = (sz < 0) ? sz : -EIO;
                printk(DM_GECKO_PREFIX "vfs_write metadata, dev-map %d\n", err);
                goto out_close;
        }

        /* Write the maps. Both maps have the same size; furthermore,
         * the allocated (but possibly unused) size of each map is a
         * multiple of PAGE_SIZE to make selective metadata writes
         * easier (and more efficient). */
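        /* For example (illustrative figures only, not taken from the
         * code): with 4KiB pages and 4-byte map entries, a device of
         * 2^20 mapped blocks yields a 4MiB map, written by the loop
         * below as 1024 page-sized vfs_write() calls. */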
        map_size = PAGE_ALIGN(sizeof(*dmg->d_map) * dmg->size);
        for (i = 0; i < map_size; i += PAGE_SIZE) {
                char *src = &((char *)dmg->d_map)[i];

                sz = vfs_write(file, src, PAGE_SIZE, &pos);
                if (sz != PAGE_SIZE) {
                        err = (sz < 0) ? sz : -EIO;
                        printk(DM_GECKO_PREFIX "vfs_write ->d_map\n");
                        goto out_close;
                }
        }
        for (i = 0; i < map_size; i += PAGE_SIZE) {
                char *src = &((char *)dmg->r_map)[i];

                sz = vfs_write(file, src, PAGE_SIZE, &pos);
                if (sz != PAGE_SIZE) {
                        err = (sz < 0) ? sz : -EIO;
                        printk(DM_GECKO_PREFIX "vfs_write ->r_map\n");
                        goto out_close;
                }
        }

out_close:
        filp_close(file, current->files);

        if (sys_rename_wrapper != NULL) {
                err = sys_rename_wrapper(meta_filename_tmp, dmg->meta_filename);
                if (err) {
                        printk(DM_GECKO_PREFIX "sys_rename: %d\n", err);
                        goto out;
                }
        }

out:
        if (sys_rename_wrapper != NULL) {
                kfree(meta_filename_tmp);
        }
out_free_page:
        free_page((unsigned long)page);
        set_fs(old_fs);
        return err;
}
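
/*
 * On-disk layout of the metadata file written above, for reference:
 *   - one PAGE_SIZE header: a struct dm_gecko_persistent_metadata
 *     followed by disk_map_cnt * stripes "major:minor" device names
 *     (struct dm_gecko_dev entries);
 *   - PAGE_ALIGN(sizeof(*d_map) * size) bytes of the direct map;
 *   - the same number of bytes for the reverse map.
 * Keeping every section page-aligned is what keeps selective
 * (page-granular) metadata rewrites simple and efficient.
 */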

static void gecko_dtr(struct dm_target *ti)
{
        // At this point, `dmsetup message' cannot be issued against
        // the module any longer, therefore only the extant
        // metadata-sync and gc may be running (besides regular IOs
        // that have not yet completed).
        struct dm_gecko *dmg = (struct dm_gecko *)ti->private;

        // Wait for pending metadata sync to complete.
        down_write(&dmg->metadata_sync_sema);
        // Never clear this bit; the module is about to be unloaded.
        set_bit(DM_GECKO_FINAL_SYNC_METADATA, &dmg->flags);
        up_write(&dmg->metadata_sync_sema);

        set_bit(DM_GECKO_GC_FORCE_STOP, &dmg->flags);
        // Metadata sync may have restarted the timer upon exit.
        atomic_set(&dmg->timer_active, 0);
        hrtimer_cancel(&dmg->timer);
        printk(DM_GECKO_PREFIX "hrtimer destroyed\n");
        // Wait for pending IOs to complete.
        wait_event(dmg->jobs_pending_waitqueue, !atomic_read(&dmg->total_jobs));
        dmg_kcopyd_client_destroy(dmg->kcopyd_client);
        dm_io_client_destroy(dmg->io_client);
        store_dm_gecko(dmg);
        // WARNING, must be done before put_devices.
        power_up_all_segments(dmg);
        dm_gecko_put_devices(ti, dmg);
        vfree(dmg->d_map);
        vfree(dmg->r_map);
        kfree(dmg->buckets);
        free_percpu(dmg->stats);
        kfree(dmg->meta_filename);
        kfree(dmg);
        printk(DM_GECKO_PREFIX "gecko_dtr done.\n");
}

static int gecko_status(struct dm_target *ti, status_type_t type,
                        char *result, unsigned int maxlen)
{
        struct dm_gecko *dmg = (struct dm_gecko *)ti->private;
        int cpu, sz = 0;        /* sz is used by DMEMIT */
        struct dm_gecko_stats aggregate_stats, *cursor;
        struct deferred_stats aggregate_def_stats, *def_stats;

        memset(&aggregate_stats, 0, sizeof(aggregate_stats));
        memset(&aggregate_def_stats, 0, sizeof(aggregate_def_stats));

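        /* Statistics are kept per-CPU so the IO paths can update them
         * without locking or contending on shared cache lines; fold
         * them into a single aggregate snapshot before reporting. */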
        for_each_possible_cpu(cpu) {
                cursor = per_cpu_ptr(dmg->stats, cpu);

                aggregate_stats.reads += cursor->reads;
                aggregate_stats.writes += cursor->writes;
                aggregate_stats.subblock_reads += cursor->subblock_reads;
                aggregate_stats.subblock_writes += cursor->subblock_writes;
                aggregate_stats.gc += cursor->gc;
                aggregate_stats.discards += cursor->discards;
                aggregate_stats.dropped_discards += cursor->dropped_discards;
                aggregate_stats.empty_barriers += cursor->empty_barriers;
                aggregate_stats.gc_recycle += cursor->gc_recycle;
                aggregate_stats.rw_clash += cursor->rw_clash;
                aggregate_stats.rw_gc_clash += cursor->rw_gc_clash;
                aggregate_stats.gc_clash += cursor->gc_clash;
                aggregate_stats.gc_rw_clash += cursor->gc_rw_clash;
                aggregate_stats.ww_clash += cursor->ww_clash;
                aggregate_stats.read_empty += cursor->read_empty;
                aggregate_stats.read_err += cursor->read_err;
                aggregate_stats.write_err += cursor->write_err;
                aggregate_stats.kcopyd_err += cursor->kcopyd_err;
                aggregate_stats.sb_read += cursor->sb_read;
                aggregate_stats.sb_write += cursor->sb_write;

                def_stats = &per_cpu(deferred_stats, cpu);
                aggregate_def_stats.gc += def_stats->gc;
                aggregate_def_stats.rw += def_stats->rw;
                aggregate_def_stats.total += def_stats->total;
        }

        switch (type) {
        case STATUSTYPE_INFO:
                DMEMIT("reads(%llu), writes(%llu), "
                       "subblock_reads(%llu), subblock_writes(%llu), "
                       "gc(%llu), discards(%llu), dropped_discards(%llu), "
                       "empty_barriers(%llu), "
                       "gc_recycle(%llu), rw_clash(%llu), rw_gc_clash(%llu), "
                       "gc_clash(%llu), gc_rw_clash(%llu), ww_clash(%llu), "
                       "read_empty(%llu), read_err(%llu), write_err(%llu), "
                       "kcopyd_err(%llu), sb_read(%llu), sb_write(%llu), "
                       "deferred_gc(%llu), deferred_rw(%llu), "
                       "deferred_total(%llu), total_jobs(%d)",
                       aggregate_stats.reads,
                       aggregate_stats.writes,
                       aggregate_stats.subblock_reads,
                       aggregate_stats.subblock_writes,
                       aggregate_stats.gc,
                       aggregate_stats.discards,
                       aggregate_stats.dropped_discards,
                       aggregate_stats.empty_barriers,
                       aggregate_stats.gc_recycle,
                       aggregate_stats.rw_clash,
                       aggregate_stats.rw_gc_clash,
                       aggregate_stats.gc_clash,
                       aggregate_stats.gc_rw_clash,
                       aggregate_stats.ww_clash,
                       aggregate_stats.read_empty,
                       aggregate_stats.read_err,
                       aggregate_stats.write_err,
                       aggregate_stats.kcopyd_err,
                       aggregate_stats.sb_read,
                       aggregate_stats.sb_write,
                       aggregate_def_stats.gc,
                       aggregate_def_stats.rw,
                       aggregate_def_stats.total,
                       atomic_read(&dmg->total_jobs));
                break;
        case STATUSTYPE_TABLE:
                DMEMIT("mode(%s{%d} | %s | %s | %s | %s) size(%lu), "
                       "htable_size(%lu), "
                       "tail(%lu|%d), head(%lu|%d), available_blocks(%lu), "
                       "free_blocks(%lu), used_blocks(%lu), "
                       "unavailable_blocks(%lu), "
                       "relocatable_blocks(%lu), gc_req_in_progress(%lu), "
                       "tail_wrap_around(%lu), head_wrap_around(%lu)",
                       dmg->disk_map.layout == linear ? "linear" :
                       (dmg->disk_map.layout == raid1 ? "raid1" :
                        (dmg->disk_map.layout == raid0 ? "raid0" : "unknown")),
                       dmg->disk_map.stripes,
                       test_bit(DM_GECKO_GC_FORCE_STOP,
                                &dmg->flags) ? "gc-off"
                       : (test_bit(DM_GECKO_GC_STARTED, &dmg->flags) ?
                          "gc-on" : "gc-idle"),
                       test_bit(DM_GECKO_READ_TPUT,
                                &dmg->flags) ? "max-read-tput" : "low-power",
                       test_bit(DM_GECKO_INDEPENDENT_GC,
                                &dmg->flags) ? "gc-independent" : "gc-random",
                       test_bit(DM_GECKO_SYNCING_METADATA,
                                &dmg->flags) ? "SYNC-METADATA-ON"
                       : "SYNC-METADATA-OFF",
                       (long unsigned)dmg->size,
                       (long unsigned)dmg->htable_size,
                       (long unsigned)dmg->tail, dmg->tail_seg->idx,
                       (long unsigned)dmg->head, dmg->head_seg->idx,
                       (long unsigned)dmg->available_blocks,
                       (long unsigned)dmg->free_blocks,
                       (long unsigned)__used_blocks(dmg),
                       (long unsigned)__unavailable_blocks(dmg),
                       (long unsigned)__relocatable_blocks(dmg),
                       (long unsigned)dmg->gc_req_in_progress,
                       dmg->tail_wrap_around,
                       dmg->head_wrap_around);
                if (test_bit(DM_GECKO_STATUS_DETAILED, &dmg->flags)) {
#define DMG_STATE0 0
#define DMG_STATE1 1
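                        /* Two-state scan over the log from tail to head:
                         * DMG_STATE0 looks for the start of a run of free
                         * blocks, DMG_STATE1 looks for its end and emits
                         * the run as "start:length". */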
                        u32 tail, loopcnt, cursor, next_free, total_free;
                        u32 automata_state = DMG_STATE0;

                        DMEMIT("\n" DM_GECKO_PREFIX
                               "detail: tail=%lu, head=%lu\n",
                               (long unsigned)dmg->tail,
                               (long unsigned)dmg->head);

                        tail = dmg->tail;
                        cursor = dmg->tail;
                        next_free = dmg->tail;
                        total_free = 0;
                        for (loopcnt = 0; loopcnt < MAX_DETAIL_LOG_LOOP_CNT;) {

                                if (cursor == dmg->head)
                                        break;

                                switch (automata_state) {
                                case DMG_STATE0:
                                        if (is_block_marked_free
                                            (dmg->r_map[cursor], dmg)) {
                                                next_free = cursor;
                                                total_free = 1;
                                                automata_state = DMG_STATE1;
                                        }
                                        break;
                                case DMG_STATE1:
                                        if (!is_block_marked_free
                                            (dmg->r_map[cursor], dmg)) {
                                                DMEMIT("%lu:%lu\n",
                                                       (long unsigned)
                                                       next_free,
                                                       (long unsigned)
                                                       total_free);
                                                ++loopcnt;
                                                total_free = 0;
                                                automata_state = DMG_STATE0;
                                        } else {
                                                /* still in a free run */
                                                ++total_free;
                                        }
                                        break;
                                default:
                                        BUG();
                                }

                                if ((++cursor) == dmg->size) {
                                        /* wrap around */
                                        cursor = 0;
                                }
                        }
                }
                break;
        }

        return 0;
}

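/*
 * Handler for `dmsetup message' commands. Supported messages (the
 * <gecko-dev> device name below is only an example placeholder):
 *
 *   dmsetup message <gecko-dev> 0 set-low-power [sleep|standby]
 *   dmsetup message <gecko-dev> 0 set-high-read-tput
 *   dmsetup message <gecko-dev> 0 gc-independent
 *   dmsetup message <gecko-dev> 0 gc-off | gc-on
 *   dmsetup message <gecko-dev> 0 detail-on | detail-off
 *   dmsetup message <gecko-dev> 0 sync-metadata
 *   dmsetup message <gecko-dev> 0 sync-metadata-asynchronously
 *   dmsetup message <gecko-dev> 0 set-gc-max-concurrent-requests <n>
 *   dmsetup message <gecko-dev> 0 set-gc-watermarks <low> <high>
 */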
static int gecko_message(struct dm_target *ti, unsigned argc, char **argv)
{
        struct dm_gecko *dmg = (struct dm_gecko *)ti->private;

        if (argc < 1 || argc > 3) {
                ti->error = DM_GECKO_PREFIX "invalid number of arguments";
                goto bad;
        }

        if (strcmp(argv[0], "set-low-power") == 0) {
                if (argc == 2) {
                        if (strcmp(argv[1], "sleep") == 0) {
                                dmg->low_pow_state = sleep;
                        } else if (strcmp(argv[1], "standby") == 0) {
                                dmg->low_pow_state = standby;
                        } else {
                                printk(DM_GECKO_PREFIX
                                       "invalid set-low-power parameter: %s\n",
                                       argv[1]);
                                goto bad;
                        }
                }
                clear_bit(DM_GECKO_READ_TPUT, &dmg->flags);
                sched_delayed_power_adjustment_for_segments(dmg);
        } else if (strcmp(argv[0], "set-high-read-tput") == 0) {
                set_bit(DM_GECKO_READ_TPUT, &dmg->flags);
                /* There is no need to power up the mirrored disks
                 * explicitly; the Linux IDE driver is supposed to
                 * issue the reset lazily and on demand. Do it anyway. */
                power_up_all_segments(dmg);
        } else if (strcmp(argv[0], "gc-independent") == 0) {
                set_bit(DM_GECKO_INDEPENDENT_GC, &dmg->flags);
                power_up_segment(seg_for_sector(dmg,
                                                block_to_sector(dmg->tail)));
        } else if (strcmp(argv[0], "gc-off") == 0) {
                set_bit(DM_GECKO_GC_FORCE_STOP, &dmg->flags);
        } else if (strcmp(argv[0], "gc-on") == 0) {
                clear_bit(DM_GECKO_GC_FORCE_STOP, &dmg->flags);
        } else if (strcmp(argv[0], "detail-on") == 0) {
                set_bit(DM_GECKO_STATUS_DETAILED, &dmg->flags);
        } else if (strcmp(argv[0], "detail-off") == 0) {
                clear_bit(DM_GECKO_STATUS_DETAILED, &dmg->flags);
        } else if (strcmp(argv[0], "sync-metadata") == 0) {
                do_sync_metadata(dmg);
        } else if (strcmp(argv[0], "sync-metadata-asynchronously") == 0) {
                queue_work(gecko_sync_metadata_wqueue,
                           &dmg->sync_metadata_work);
        } else if (strcmp(argv[0], "set-gc-max-concurrent-requests") == 0) {
                int max_gc_concurrent_req;
                if (argc < 2) {
                        ti->error =
                          DM_GECKO_PREFIX "too few args (need one integer)";
                        goto bad;
                }
                max_gc_concurrent_req = simple_strtol(argv[1], NULL, 10);
                if (max_gc_concurrent_req < MIN_GC_CONCURRENT_REQ ||
                    max_gc_concurrent_req > MAX_GC_CONCURRENT_REQ) {
                        ti->error =
                          DM_GECKO_PREFIX "invalid argument (not in range)";
                        goto bad;
                }
                dmg->max_gc_req_in_progress = max_gc_concurrent_req;
        } else if (strcmp(argv[0], "set-gc-watermarks") == 0) {
                unsigned long low_gc_watermark, high_gc_watermark;
                if (argc < 3) {
                        ti->error =
                          DM_GECKO_PREFIX "too few args (need 2 watermarks)";
                        goto bad;
                }
                low_gc_watermark = simple_strtoul(argv[1], NULL, 10);
                high_gc_watermark = simple_strtoul(argv[2], NULL, 10);
                if (low_gc_watermark >= high_gc_watermark) {
                        ti->error =
                            DM_GECKO_PREFIX "low watermark >= high watermark";
                        goto bad;
                }
                dmg->gc_ctrl.low_watermark = low_gc_watermark;
                dmg->gc_ctrl.high_watermark = high_gc_watermark;
        } else {
                ti->error = DM_GECKO_PREFIX "invalid dmsetup message";
                goto bad;
        }

        return 0;
bad:
        printk("%s\n", ti->error);
        return -EINVAL;
}

static struct target_type gecko_target = {
        .name = "gecko",
        .version = {1, 0, 1},
        .module = THIS_MODULE,
        .ctr = gecko_ctr,
        .dtr = gecko_dtr,
        .map = gecko_map,
        .status = gecko_status,
        .message = gecko_message,
};

static int __init dm_gecko_init(void)
{
        int err = -ENOMEM;

#ifdef CONFIG_KALLSYMS
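        /* sys_rename is typically not exported to modules; resolve it
         * through kallsyms so that the metadata store path can write to
         * a temporary file and atomically rename it over the previous
         * copy. */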
        unsigned long sys_rename_addr = kallsyms_lookup_name("sys_rename");
        if (sys_rename_addr == 0) {
                printk(DM_GECKO_PREFIX "Unable to lookup sys_rename symbol\n");
        } else {
                sys_rename_wrapper = (void *) sys_rename_addr;
                printk(DM_GECKO_PREFIX "Found sys_rename at address 0x%p\n",
                       sys_rename_wrapper);
        }
#elif defined SYS_RENAME_EXPORTED_TO_MODULES
        sys_rename_wrapper = sys_rename;
#endif

        /* init global resources for all gecko targets at module load
         * time */
        if ((err = dmg_kcopyd_init()) != 0) {
                printk(DM_GECKO_PREFIX "Unable to init kcopyd\n");
                goto out1;
        }
        }
        /* Reset err: the allocation failures below must not return 0. */
        err = -ENOMEM;
        if (!(io_for_block_cache = KMEM_CACHE(io_for_block, 0))) {
                printk(DM_GECKO_PREFIX "Unable to alloc io_for_block cache\n");
                goto out1;
        }

        if (!(io_job_cache = KMEM_CACHE(io_job, 0))) {
                printk(DM_GECKO_PREFIX "unable to alloc io_job cache\n");
                goto out2;
        }

        io_for_block_mempool = mempool_create_slab_pool(MIN_JOBS_IN_POOL,
                                                        io_for_block_cache);
        if (!io_for_block_mempool) {
                printk(DM_GECKO_PREFIX
                       "unable to alloc io_for_block mempool\n");
                goto out3;
        }

        io_job_mempool = mempool_create_slab_pool(MIN_JOBS_IN_POOL,
                                                  io_job_cache);
        if (!io_job_mempool) {
                printk(DM_GECKO_PREFIX "unable to alloc io_job mempool\n");
                goto out4;
        }

        /* The correctness of the algorithms relies on the assumption
         * that gecko_wqueue is a single-threaded workqueue. */
        if (!(gecko_wqueue = create_singlethread_workqueue("geckod"))) {
                printk(DM_GECKO_PREFIX "unable to create geckod workqueue\n");
                goto out5;
        }
        INIT_WORK(&gecko_work, run_deferred_jobs);

        if (!(gecko_sync_metadata_wqueue =
              create_singlethread_workqueue("geckod-meta"))) {
                printk(DM_GECKO_PREFIX
                       "unable to create geckod-meta workqueue\n");
                goto out6;
        }

        if ((err = dm_register_target(&gecko_target)) < 0) {
                printk(DM_GECKO_PREFIX "register target failed %d\n", err);
                goto out7;
        }

        printk(DM_GECKO_PREFIX "module loaded\n");
        return 0;
out7:
        destroy_workqueue(gecko_sync_metadata_wqueue);
out6:
        destroy_workqueue(gecko_wqueue);
out5:
        mempool_destroy(io_job_mempool);
out4:
        mempool_destroy(io_for_block_mempool);
out3:
        kmem_cache_destroy(io_job_cache);
out2:
        kmem_cache_destroy(io_for_block_cache);
out1:
        return err;
}

static void __exit dm_gecko_exit(void)
{
        dm_unregister_target(&gecko_target);
        BUG_ON(!list_empty(&deferred_jobs));
        mempool_destroy(io_job_mempool);
        mempool_destroy(io_for_block_mempool);
        kmem_cache_destroy(io_job_cache);
        kmem_cache_destroy(io_for_block_cache);
        destroy_workqueue(gecko_wqueue);
        destroy_workqueue(gecko_sync_metadata_wqueue);
        dmg_kcopyd_exit();
        printk(DM_GECKO_PREFIX "module unloaded\n");
}

module_init(dm_gecko_init);
module_exit(dm_gecko_exit);

MODULE_DESCRIPTION("Gecko: power saving log structured storage system");
MODULE_AUTHOR("Tudor Marian <tudorm@cs.cornell.edu>");
#ifndef MODULE_LICENSE
#define MODULE_LICENSE(a)
#endif
MODULE_LICENSE("Dual BSD/GPL");