Linux3.10.0块IO子系统流程（3）-- SCSI策略例程

很长时间以来，Linux块设备使用了一种称为“蓄流/泄流”（plugging/unplugging）的技术来改进吞吐率。简单而言，这种工作方式类似浴盆排水系统的塞子。当IO被提交时，它先被储存在一个队列中，稍后的某个时间，我们才允许IO从队列派发出去。之所以这么做，是为了尽可能对IO进行合并和排序。

 /*
  * scsi_request_fn - request-queue strategy routine for a SCSI device
  * @q: request queue; q->queuedata is the owning struct scsi_device
  *
  * Repeatedly peeks the next request on @q and hands the SCSI command
  * attached to it (req->special) to scsi_dispatch_cmd().  The loop ends when
  * the queue is empty, when the device, target or host cannot accept more
  * work, or when a dispatch returns non-zero.
  *
  * Locking: blk_peek_request() requires q->queue_lock, so this is entered
  * with that lock held.  The lock is dropped around the dispatch and
  * re-acquired before every exit path after the loop, so it is held again
  * on return.
  */
 static void scsi_request_fn(struct request_queue *q)
 {
     struct scsi_device *sdev = q->queuedata;
     struct Scsi_Host *shost;
     struct scsi_cmnd *cmd;
     struct request *req;

     if (!get_device(&sdev->sdev_gendev))
         /* We must be tearing the block queue down already */
         return;

     /*
      * To start with, we keep looping until the queue is empty, or until
      * the host is no longer able to accept any more requests.
      */
     shost = sdev->host;
     for (;;) {
         int rtn;

         /*
          * Get the next queueable request.  We do this early to make sure
          * that the request is fully prepared even if we cannot accept it.
          * Leave the loop if there is no request, or if the device cannot
          * take one right now.
          */
         req = blk_peek_request(q);
         if (!req || !scsi_dev_queue_ready(q, sdev))
             break;

         /*
          * Device is offline: log it, fail this request through
          * scsi_kill_request() and treat every following request the
          * same way.
          */
         if (unlikely(!scsi_device_online(sdev))) {
             sdev_printk(KERN_ERR, sdev,
                     "rejecting I/O to offline device\n");
             scsi_kill_request(req, q);
             continue;
         }

         /*
          * Remove the request from the request list.
          * blk_start_request() is skipped only when the queue is tagged and
          * blk_queue_start_tag() returns 0.
          * NOTE(review): presumably the tag path has then already started
          * the request (dequeue + timeout timer) itself -- confirm.
          */
         if (!(blk_queue_tagged(q) && !blk_queue_start_tag(q, req)))
             blk_start_request(req);
         sdev->device_busy++;

         /*
          * NOTE(review): non-_irq unlock/lock pair; interrupts presumably
          * stay disabled while switching from queue_lock to host_lock --
          * confirm against the caller's locking.
          */
         spin_unlock(q->queue_lock);

         /*
          * The SCSI command lives in the request's special field; it was
          * set up by the queue's prep_rq_fn callback during the
          * blk_peek_request() call above.
          */
         cmd = req->special;
         if (unlikely(cmd == NULL)) {
             printk(KERN_CRIT "impossible request in %s.\n"
                      "please mail a stack trace to "
                      "linux-scsi@vger.kernel.org\n",
                      __func__);
             blk_dump_rq_flags(req, "foo");
             BUG();
         }
         spin_lock(shost->host_lock);

         /*
          * We hit this when the driver is using a host wide
          * tag map. For device level tag maps the queue_depth check
          * in the device ready fn would prevent us from trying
          * to allocate a tag. Since the map is a shared host resource
          * we add the dev to the starved list so it eventually gets
          * a run when a tag is freed.
          */
         if (blk_queue_tagged(q) && !blk_rq_tagged(req)) {
             if (list_empty(&sdev->starved_entry))
                 list_add_tail(&sdev->starved_entry,
                           &shost->starved_list);
             goto not_ready;
         }

         if (!scsi_target_queue_ready(shost, sdev))
             goto not_ready;

         if (!scsi_host_queue_ready(q, shost, sdev))
             goto not_ready;

         scsi_target(sdev)->target_busy++;
         shost->host_busy++;

         /*
          * XXX(hch): This is rather suboptimal, scsi_dispatch_cmd will
          *        take the lock again.
          */
         spin_unlock_irq(shost->host_lock);

         /*
          * Finally, initialize any error handling parameters, and set up
          * the timers for timeouts.
          */
         scsi_init_cmd_errh(cmd);

         /*
          * Dispatch the command to the low-level driver.
          */
         rtn = scsi_dispatch_cmd(cmd);
         spin_lock_irq(q->queue_lock);
         if (rtn)
             goto out_delay;
     }

     goto out;

 not_ready:
     spin_unlock_irq(shost->host_lock);

     /*
      * lock q, handle tag, requeue req, and decrement device_busy. We
      * must return with queue_lock held.
      *
      * Decrementing device_busy without checking it is OK, as all such
      * cases (host limits or settings) should run the queue at some
      * later time.
      */
     spin_lock_irq(q->queue_lock);
     blk_requeue_request(q, req);
     sdev->device_busy--;
 out_delay:
     /*
      * Ask for a delayed queue run only when the device has no commands
      * counted in device_busy.
      */
     if (sdev->device_busy == 0)
         blk_delay_queue(q, SCSI_QUEUE_DELAY);
 out:
     /* must be careful here...if we trigger the ->remove() function
      * we cannot be holding the q lock */
     spin_unlock_irq(q->queue_lock);
     put_device(&sdev->sdev_gendev);
     spin_lock_irq(q->queue_lock);
 }

blk_peek_request从请求队列“顶部”取得下一个请求。函数的实现就是一个大循环，每次调用__elv_next_request从电梯队列中取出一个请求进行处理

 /**
  * blk_peek_request - look at the request at the head of a queue
  * @q: request queue to inspect
  *
  * Description:
  *     Pulls candidates from __elv_next_request() until one can be handed
  *     to the driver.  The request is marked REQ_STARTED the first time it
  *     is seen and, unless it carries REQ_DONTPREP or the queue has no
  *     prep_rq_fn, is passed through q->prep_rq_fn() first.  The caller
  *     should start the returned request with blk_start_request() before
  *     the LLD processes it.
  *
  * Return:
  *     The request at the head of @q, or NULL when nothing is available
  *     or the head request had to be deferred.
  *
  * Context:
  *     queue_lock must be held.
  */
 struct request *blk_peek_request(struct request_queue *q)
 {
     struct request *rq;
     int prep_rc;

     for (;;) {
         rq = __elv_next_request(q);
         if (rq == NULL)
             return NULL;

         rq = blk_pm_peek_request(q, rq);
         if (rq == NULL)
             return NULL;

         /*
          * A request is either brand new or was put back because it could
          * not be handled yet; the latter already has REQ_STARTED set.
          * Without the flag this is the first time the driver sees it.
          */
         if ((rq->cmd_flags & REQ_STARTED) == 0) {
             /* Sorted requests are announced to the IO scheduler. */
             if (rq->cmd_flags & REQ_SORTED)
                 elv_activate_rq(q, rq);

             /*
              * Mark as started even if we don't start it: a delayed
              * request should not be passed by new incoming requests.
              */
             rq->cmd_flags |= REQ_STARTED;
             trace_block_rq_issue(q, rq);
         }

         /* Bookkeeping shared with the IO scheduler. */
         if (q->boundary_rq == NULL || q->boundary_rq == rq) {
             q->end_sector = rq_end_sector(rq);
             q->boundary_rq = NULL;
         }

         /* REQ_DONTPREP on the request: no preparation needed, hand it out. */
         if (rq->cmd_flags & REQ_DONTPREP)
             return rq;

         /*
          * A non-zero dma_drain_size means one extra segment is reserved
          * so a drain buffer can later be appended to the scatterlist.
          * There is room for it because max_hw_segments has been adjusted
          * to be one fewer than the device can handle.
          */
         if (q->dma_drain_size && blk_rq_bytes(rq))
             rq->nr_phys_segments++;

         /* No prepare callback registered: return the request as is. */
         if (q->prep_rq_fn == NULL)
             return rq;

         /*
          * The callback reports one of:
          *     BLKPREP_OK    - preparation succeeded
          *     BLKPREP_DEFER - cannot proceed yet, keep the request queued
          *     BLKPREP_KILL  - request cannot be processed, fail it with
          *                     an IO error and try the next one
          */
         prep_rc = q->prep_rq_fn(q, rq);

         if (prep_rc == BLKPREP_OK)
             return rq;

         if (prep_rc == BLKPREP_DEFER) {
             /*
              * The request may have been (partially) prepped.  It stays
              * at the front to avoid resource deadlock; REQ_STARTED
              * prevents other fs requests from passing it.  Undo the
              * drain segment added above so it is not added twice.
              */
             if (q->dma_drain_size && blk_rq_bytes(rq) &&
                 !(rq->cmd_flags & REQ_DONTPREP))
                 --rq->nr_phys_segments;
             return NULL;
         }

         if (prep_rc != BLKPREP_KILL) {
             printk(KERN_ERR "%s: bad return=%d\n", __func__, prep_rc);
             return rq;
         }

         /*
          * BLKPREP_KILL: fail the request quietly.  It is started first so
          * the end I/O path does not trigger any debug logic, then the
          * loop moves on to the next candidate.
          */
         rq->cmd_flags |= REQ_QUIET;
         blk_start_request(rq);
         __blk_end_request_all(rq, -EIO);
     }
 }

请求队列中的prep_rq_fn回调函数实现了从请求构造SCSI命令的方法，prep_rq_fn回调函数关键有两个任务：

构造命令描述块
如果需要的话为数据传输准备聚散列表

命令描述块和聚散列表都被封装到SCSI命令描述符中，我们知道，请求至少有两个来源

来自上层bio
来自SCSI公共服务层

在刚找到SCSI设备为其初始化请求队列时，这个回调函数被设置为scsi_prep_fn

 /*
  * scsi_alloc_queue - allocate and wire up the request queue of a SCSI device
  * @sdev: device the queue is created for
  *
  * Allocates the queue through __scsi_alloc_queue() with scsi_request_fn as
  * its strategy routine, then registers the prepare, softirq-done, timeout
  * and lld-busy callbacks on it.
  *
  * Returns the new queue, or NULL if the allocation failed.
  */
 struct request_queue *scsi_alloc_queue(struct scsi_device *sdev)
 {
     struct request_queue *queue;

     queue = __scsi_alloc_queue(sdev->host, scsi_request_fn);
     if (queue != NULL) {
         blk_queue_prep_rq(queue, scsi_prep_fn);
         blk_queue_softirq_done(queue, scsi_softirq_done);
         blk_queue_rq_timed_out(queue, scsi_times_out);
         blk_queue_lld_busy(queue, scsi_lld_busy);
     }

     return queue;
 }

 /**
  * blk_queue_prep_rq - set a prepare_request function for queue
  * @q:        queue
  * @pfn:    prepare_request function
  *
  * It's possible for a queue to register a prepare_request callback which
  * is invoked before the request is handed to the request_fn. The goal of
  * the function is to prepare a request for I/O, it can be used to build a
  * cdb from the request data for instance.
  *
  * The callback is stored in @q->prep_rq_fn, replacing whatever was stored
  * there before. No locking is done here.
  */

 void blk_queue_prep_rq(struct request_queue *q, prep_rq_fn *pfn)

 {

     q->prep_rq_fn = pfn;

 }

初始化回调

如果SCSI设备被高层驱动绑定，这个回调函数会被修改，例如，在sd驱动的探测流程（sd_probe_async）中被设置成sd_prep_fn

 /*
  * sd_probe_async - asynchronous part of attaching a SCSI disk
  * @data:   the struct scsi_disk being attached
  * @cookie: async cookie; not used in this function
  *
  * Fills in the gendisk that belongs to the scsi_disk, resets the per-disk
  * state to defaults, installs sd's prepare/unprepare callbacks on the
  * device's request queue and registers the disk with add_disk().  Before
  * returning it drops a reference on sdkp->dev with put_device().
  */
 static void sd_probe_async(void *data, async_cookie_t cookie)
 {
     struct scsi_disk *sdkp = data;
     struct scsi_device *sdp;
     struct gendisk *gd;
     u32 index;
     struct device *dev;

     sdp = sdkp->device;
     gd = sdkp->disk;
     index = sdkp->index;
     dev = &sdp->sdev_gendev;

     /*
      * Bits 4-7 of the disk index select the major.  The first minor is
      * built from the low nibble shifted up by 4, combined with index
      * bits 8-19.
      */
     gd->major = sd_major((index & 0xf0) >> 4);
     gd->first_minor = ((index & 0xf) << 4) | (index & 0xfff00);
     gd->minors = SD_MINORS;

     gd->fops = &sd_fops;
     gd->private_data = &sdkp->driver;
     gd->queue = sdkp->device->request_queue;

     /* defaults, until the device tells us otherwise */
     sdp->sector_size = 512;
     sdkp->capacity = 0;
     sdkp->media_present = 1;
     sdkp->write_prot = 0;
     sdkp->cache_override = 0;
     sdkp->WCE = 0;
     sdkp->RCD = 0;
     sdkp->ATO = 0;
     sdkp->first_scan = 1;
     sdkp->max_medium_access_timeouts = SD_MAX_MEDIUM_TIMEOUTS;

     sd_revalidate_disk(gd);

     /*
      * Replace the queue's prepare callback with sd's own and register the
      * matching unprepare callback.
      */
     blk_queue_prep_rq(sdp->request_queue, sd_prep_fn);
     blk_queue_unprep_rq(sdp->request_queue, sd_unprep_fn);

     gd->driverfs_dev = &sdp->sdev_gendev;
     gd->flags = GENHD_FL_EXT_DEVT;
     if (sdp->removable) {
         gd->flags |= GENHD_FL_REMOVABLE;
         gd->events |= DISK_EVENT_MEDIA_CHANGE;
     }

     add_disk(gd);
     if (sdkp->capacity)
         sd_dif_config_host(sdkp);

     /* Revalidate a second time now that the disk has been added. */
     sd_revalidate_disk(gd);

     sd_printk(KERN_NOTICE, sdkp, "Attached SCSI %sdisk\n",
           sdp->removable ? "removable " : "");
     blk_pm_runtime_init(sdp->request_queue, dev);
     scsi_autopm_put_device(sdp);
     put_device(&sdkp->dev);
 }

初始化回调

在前一种情况下，SCSI设备只能处理来自SCSI公共服务层的请求；在后一种情况下，SCSI设备不仅能处理来自SCSI公共服务层的请求，还能够处理来自上层的bio请求，分析见下一节

秒客网

Linux3.10.0块IO子系统流程（3）-- SCSI策略例程

相关文章