xref: /casio_MT6878_16.0.0_master/vnd/kernel-6.1/block/blk-core.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright (C) 1994, Karl Keyte: Added support for disk statistics
5 * Elevator latency, (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
6 * Queue request tables / lock, selectable elevator, Jens Axboe <axboe@suse.de>
7 * kernel-doc documentation started by NeilBrown <neilb@cse.unsw.edu.au>
8 * - July 2000
9 * bio rewrite, highmem i/o, etc, Jens Axboe <axboe@suse.de> - May 2001
10 */
11
12 /*
13 * This handles all read/write requests to block devices
14 */
15 #include <linux/kernel.h>
16 #include <linux/module.h>
17 #include <linux/bio.h>
18 #include <linux/blkdev.h>
19 #include <linux/blk-pm.h>
20 #include <linux/blk-integrity.h>
21 #include <linux/highmem.h>
22 #include <linux/mm.h>
23 #include <linux/pagemap.h>
24 #include <linux/kernel_stat.h>
25 #include <linux/string.h>
26 #include <linux/init.h>
27 #include <linux/completion.h>
28 #include <linux/slab.h>
29 #include <linux/swap.h>
30 #include <linux/writeback.h>
31 #include <linux/task_io_accounting_ops.h>
32 #include <linux/fault-inject.h>
33 #include <linux/list_sort.h>
34 #include <linux/delay.h>
35 #include <linux/ratelimit.h>
36 #include <linux/pm_runtime.h>
37 #include <linux/t10-pi.h>
38 #include <linux/debugfs.h>
39 #include <linux/bpf.h>
40 #include <linux/part_stat.h>
41 #include <linux/sched/sysctl.h>
42 #include <linux/blk-crypto.h>
43
44 #define CREATE_TRACE_POINTS
45 #include <trace/events/block.h>
46
47 #include "blk.h"
48 #ifndef __GENKSYMS__
49 #include "blk-mq-debugfs.h"
50 #endif
51 #include "blk-mq-sched.h"
52 #include "blk-pm.h"
53 #include "blk-cgroup.h"
54 #include "blk-throttle.h"
55 #include "blk-ioprio.h"
56
57 #ifdef CONFIG_BLK_MQ_USE_LOCAL_THREAD
58 extern long bio_cnt; // total bios submitted
59 extern long rt_bio_cnt; // total rt bios submitted, part of bio_cnt
60 extern long ux_bio_cnt; // total ux bios submitted, part of rt_bio_cnt
61 #endif
62
63 struct dentry *blk_debugfs_root;
64
65 EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
66 EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
67 EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete);
68 EXPORT_TRACEPOINT_SYMBOL_GPL(block_split);
69 EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug);
70 EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_insert);
71 EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_queue);
72 EXPORT_TRACEPOINT_SYMBOL_GPL(block_getrq);
73 EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_issue);
74 EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_merge);
75 EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_requeue);
76 EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_complete);
77
78 DEFINE_IDA(blk_queue_ida);
79
80 /*
81 * For queue allocation
82 */
83 struct kmem_cache *blk_requestq_cachep;
84 struct kmem_cache *blk_requestq_srcu_cachep;
85
86 /*
87 * Controlling structure to kblockd
88 */
89 static struct workqueue_struct *kblockd_workqueue;
90
91 /**
92 * blk_queue_flag_set - atomically set a queue flag
93 * @flag: flag to be set
94 * @q: request queue
95 */
96 void blk_queue_flag_set(unsigned int flag, struct request_queue *q)
97 {
98 set_bit(flag, &q->queue_flags);
99 }
100 EXPORT_SYMBOL(blk_queue_flag_set);
101
102 /**
103 * blk_queue_flag_clear - atomically clear a queue flag
104 * @flag: flag to be cleared
105 * @q: request queue
106 */
107 void blk_queue_flag_clear(unsigned int flag, struct request_queue *q)
108 {
109 clear_bit(flag, &q->queue_flags);
110 }
111 EXPORT_SYMBOL(blk_queue_flag_clear);
112
113 /**
114 * blk_queue_flag_test_and_set - atomically test and set a queue flag
115 * @flag: flag to be set
116 * @q: request queue
117 *
118 * Returns the previous value of @flag - 0 if the flag was not set and 1 if
119 * the flag was already set.
120 */
121 bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q)
122 {
123 return test_and_set_bit(flag, &q->queue_flags);
124 }
125 EXPORT_SYMBOL_GPL(blk_queue_flag_test_and_set);
126
127 #define REQ_OP_NAME(name) [REQ_OP_##name] = #name
128 static const char *const blk_op_name[] = {
129 REQ_OP_NAME(READ),
130 REQ_OP_NAME(WRITE),
131 REQ_OP_NAME(FLUSH),
132 REQ_OP_NAME(DISCARD),
133 REQ_OP_NAME(SECURE_ERASE),
134 REQ_OP_NAME(ZONE_RESET),
135 REQ_OP_NAME(ZONE_RESET_ALL),
136 REQ_OP_NAME(ZONE_OPEN),
137 REQ_OP_NAME(ZONE_CLOSE),
138 REQ_OP_NAME(ZONE_FINISH),
139 REQ_OP_NAME(ZONE_APPEND),
140 REQ_OP_NAME(WRITE_ZEROES),
141 REQ_OP_NAME(DRV_IN),
142 REQ_OP_NAME(DRV_OUT),
143 };
144 #undef REQ_OP_NAME
145
146 /**
147 * blk_op_str - Return string XXX in the REQ_OP_XXX.
148 * @op: REQ_OP_XXX.
149 *
150 * Description: Centralize block layer function to convert REQ_OP_XXX into
151 * string format. Useful in the debugging and tracing bio or request. For
152 * invalid REQ_OP_XXX it returns string "UNKNOWN".
153 */
154 inline const char *blk_op_str(enum req_op op)
155 {
156 const char *op_str = "UNKNOWN";
157
158 if (op < ARRAY_SIZE(blk_op_name) && blk_op_name[op])
159 op_str = blk_op_name[op];
160
161 return op_str;
162 }
163 EXPORT_SYMBOL_GPL(blk_op_str);
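/*
 * Illustrative sketch (hypothetical helper, not part of blk-core.c): a
 * driver debug path might use blk_op_str() together with bio_op() to log
 * an operation by name rather than by number.
 */
static void example_log_bio_op(struct bio *bio)
{
	/* bio_op() extracts REQ_OP_* from bi_opf; blk_op_str() names it */
	pr_debug("%pg: op=%s sector=%llu len=%u\n",
		 bio->bi_bdev, blk_op_str(bio_op(bio)),
		 (unsigned long long)bio->bi_iter.bi_sector,
		 bio_sectors(bio));
}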
164
165 static const struct {
166 int errno;
167 const char *name;
168 } blk_errors[] = {
169 [BLK_STS_OK] = { 0, "" },
170 [BLK_STS_NOTSUPP] = { -EOPNOTSUPP, "operation not supported" },
171 [BLK_STS_TIMEOUT] = { -ETIMEDOUT, "timeout" },
172 [BLK_STS_NOSPC] = { -ENOSPC, "critical space allocation" },
173 [BLK_STS_TRANSPORT] = { -ENOLINK, "recoverable transport" },
174 [BLK_STS_TARGET] = { -EREMOTEIO, "critical target" },
175 [BLK_STS_NEXUS] = { -EBADE, "critical nexus" },
176 [BLK_STS_MEDIUM] = { -ENODATA, "critical medium" },
177 [BLK_STS_PROTECTION] = { -EILSEQ, "protection" },
178 [BLK_STS_RESOURCE] = { -ENOMEM, "kernel resource" },
179 [BLK_STS_DEV_RESOURCE] = { -EBUSY, "device resource" },
180 [BLK_STS_AGAIN] = { -EAGAIN, "nonblocking retry" },
181 [BLK_STS_OFFLINE] = { -ENODEV, "device offline" },
182
183 /* device mapper special case, should not leak out: */
184 [BLK_STS_DM_REQUEUE] = { -EREMCHG, "dm internal retry" },
185
186 /* zone device specific errors */
187 [BLK_STS_ZONE_OPEN_RESOURCE] = { -ETOOMANYREFS, "open zones exceeded" },
188 [BLK_STS_ZONE_ACTIVE_RESOURCE] = { -EOVERFLOW, "active zones exceeded" },
189
190 /* everything else not covered above: */
191 [BLK_STS_IOERR] = { -EIO, "I/O" },
192 };
193
194 blk_status_t errno_to_blk_status(int errno)
195 {
196 int i;
197
198 for (i = 0; i < ARRAY_SIZE(blk_errors); i++) {
199 if (blk_errors[i].errno == errno)
200 return (__force blk_status_t)i;
201 }
202
203 return BLK_STS_IOERR;
204 }
205 EXPORT_SYMBOL_GPL(errno_to_blk_status);
206
207 int blk_status_to_errno(blk_status_t status)
208 {
209 int idx = (__force int)status;
210
211 if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors)))
212 return -EIO;
213 return blk_errors[idx].errno;
214 }
215 EXPORT_SYMBOL_GPL(blk_status_to_errno);
216
217 const char *blk_status_to_str(blk_status_t status)
218 {
219 int idx = (__force int)status;
220
221 if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors)))
222 return "<null>";
223 return blk_errors[idx].name;
224 }
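/*
 * Illustrative sketch (hypothetical, assumes <linux/blk-mq.h>): a blk-mq
 * driver completion path typically maps a Linux errno from the lower
 * layers back into a blk_status_t before finishing the request.
 */
static void example_complete_rq(struct request *rq, int err)
{
	/* unknown errnos fall back to BLK_STS_IOERR, see errno_to_blk_status() */
	blk_mq_end_request(rq, errno_to_blk_status(err));
}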
225
226 /**
227 * blk_sync_queue - cancel any pending callbacks on a queue
228 * @q: the queue
229 *
230 * Description:
231 * The block layer may perform asynchronous callback activity
232 * on a queue, such as calling the unplug function after a timeout.
233 * A block device may call blk_sync_queue to ensure that any
234 * such activity is cancelled, thus allowing it to release resources
235 * that the callbacks might use. The caller must already have made sure
236 * that its ->submit_bio will not re-add plugging prior to calling
237 * this function.
238 *
239 * This function does not cancel any asynchronous activity arising
240 * out of elevator or throttling code. That would require elevator_exit()
241 * and blkcg_exit_queue() to be called with queue lock initialized.
242 *
243 */
244 void blk_sync_queue(struct request_queue *q)
245 {
246 del_timer_sync(&q->timeout);
247 cancel_work_sync(&q->timeout_work);
248 }
249 EXPORT_SYMBOL(blk_sync_queue);
250
251 /**
252 * blk_set_pm_only - increment pm_only counter
253 * @q: request queue pointer
254 */
255 void blk_set_pm_only(struct request_queue *q)
256 {
257 atomic_inc(&q->pm_only);
258 }
259 EXPORT_SYMBOL_GPL(blk_set_pm_only);
260
261 void blk_clear_pm_only(struct request_queue *q)
262 {
263 int pm_only;
264
265 pm_only = atomic_dec_return(&q->pm_only);
266 WARN_ON_ONCE(pm_only < 0);
267 if (pm_only == 0)
268 wake_up_all(&q->mq_freeze_wq);
269 }
270 EXPORT_SYMBOL_GPL(blk_clear_pm_only);
271
272 /**
273 * blk_put_queue - decrement the request_queue refcount
274 * @q: the request_queue structure to decrement the refcount for
275 *
276 * Decrements the refcount of the request_queue kobject. When this reaches 0
277 * we'll have blk_release_queue() called.
278 *
279 * Context: Any context, but the last reference must not be dropped from
280 * atomic context.
281 */
282 void blk_put_queue(struct request_queue *q)
283 {
284 kobject_put(&q->kobj);
285 }
286 EXPORT_SYMBOL(blk_put_queue);
287
288 void blk_queue_start_drain(struct request_queue *q)
289 {
290 /*
291 * When queue DYING flag is set, we need to block new req
292 * entering queue, so we call blk_freeze_queue_start() to
293 * prevent I/O from crossing blk_queue_enter().
294 */
295 blk_freeze_queue_start(q);
296 if (queue_is_mq(q))
297 blk_mq_wake_waiters(q);
298 /* Make blk_queue_enter() reexamine the DYING flag. */
299 wake_up_all(&q->mq_freeze_wq);
300 }
301
302 /**
303 * blk_queue_enter() - try to increase q->q_usage_counter
304 * @q: request queue pointer
305 * @flags: BLK_MQ_REQ_NOWAIT and/or BLK_MQ_REQ_PM
306 */
307 int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags)
308 {
309 const bool pm = flags & BLK_MQ_REQ_PM;
310
311 while (!blk_try_enter_queue(q, pm)) {
312 if (flags & BLK_MQ_REQ_NOWAIT)
313 return -EAGAIN;
314
315 /*
316 * Read-side pairing of the barrier in blk_freeze_queue_start():
317 * reading the __PERCPU_REF_DEAD flag of .q_usage_counter must be
318 * ordered against reading .mq_freeze_depth or the queue dying
319 * flag, otherwise the following wait may never return if the two
320 * reads are reordered.
321 */
322 smp_rmb();
323 wait_event(q->mq_freeze_wq,
324 (!q->mq_freeze_depth &&
325 blk_pm_resume_queue(pm, q)) ||
326 blk_queue_dying(q));
327 if (blk_queue_dying(q))
328 return -ENODEV;
329 }
330
331 return 0;
332 }
333
334 int __bio_queue_enter(struct request_queue *q, struct bio *bio)
335 {
336 while (!blk_try_enter_queue(q, false)) {
337 struct gendisk *disk = bio->bi_bdev->bd_disk;
338
339 if (bio->bi_opf & REQ_NOWAIT) {
340 if (test_bit(GD_DEAD, &disk->state))
341 goto dead;
342 bio_wouldblock_error(bio);
343 return -EAGAIN;
344 }
345
346 /*
347 * Read-side pairing of the barrier in blk_freeze_queue_start():
348 * reading the __PERCPU_REF_DEAD flag of .q_usage_counter must be
349 * ordered against reading .mq_freeze_depth or the queue dying
350 * flag, otherwise the following wait may never return if the two
351 * reads are reordered.
352 */
353 smp_rmb();
354 wait_event(q->mq_freeze_wq,
355 (!q->mq_freeze_depth &&
356 blk_pm_resume_queue(false, q)) ||
357 test_bit(GD_DEAD, &disk->state));
358 if (test_bit(GD_DEAD, &disk->state))
359 goto dead;
360 }
361
362 return 0;
363 dead:
364 bio_io_error(bio);
365 return -ENODEV;
366 }
367
368 void blk_queue_exit(struct request_queue *q)
369 {
370 percpu_ref_put(&q->q_usage_counter);
371 }
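/*
 * Illustrative sketch (hypothetical; this is a block-layer-internal
 * pattern, blk_queue_enter() is not exported to modules here): work that
 * must not race with a queue freeze brackets the access with
 * blk_queue_enter()/blk_queue_exit().
 */
static int example_touch_queue(struct request_queue *q)
{
	int ret;

	ret = blk_queue_enter(q, BLK_MQ_REQ_NOWAIT);
	if (ret)
		return ret;	/* -EAGAIN here, since BLK_MQ_REQ_NOWAIT was passed */
	/* ... q_usage_counter is held, a freeze cannot complete here ... */
	blk_queue_exit(q);
	return 0;
}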
372
373 static void blk_queue_usage_counter_release(struct percpu_ref *ref)
374 {
375 struct request_queue *q =
376 container_of(ref, struct request_queue, q_usage_counter);
377
378 wake_up_all(&q->mq_freeze_wq);
379 }
380
381 static void blk_rq_timed_out_timer(struct timer_list *t)
382 {
383 struct request_queue *q = from_timer(q, t, timeout);
384
385 kblockd_schedule_work(&q->timeout_work);
386 }
387
388 static void blk_timeout_work(struct work_struct *work)
389 {
390 }
391
392 struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu)
393 {
394 struct request_queue *q;
395
396 q = kmem_cache_alloc_node(blk_get_queue_kmem_cache(alloc_srcu),
397 GFP_KERNEL | __GFP_ZERO, node_id);
398 if (!q)
399 return NULL;
400
401 if (alloc_srcu) {
402 blk_queue_flag_set(QUEUE_FLAG_HAS_SRCU, q);
403 if (init_srcu_struct(q->srcu) != 0)
404 goto fail_q;
405 }
406
407 q->last_merge = NULL;
408
409 q->id = ida_alloc(&blk_queue_ida, GFP_KERNEL);
410 if (q->id < 0)
411 goto fail_srcu;
412
413 q->stats = blk_alloc_queue_stats();
414 if (!q->stats)
415 goto fail_id;
416
417 q->node = node_id;
418
419 atomic_set(&q->nr_active_requests_shared_tags, 0);
420
421 timer_setup(&q->timeout, blk_rq_timed_out_timer, 0);
422 INIT_WORK(&q->timeout_work, blk_timeout_work);
423 INIT_LIST_HEAD(&q->icq_list);
424
425 kobject_init(&q->kobj, &blk_queue_ktype);
426
427 mutex_init(&q->debugfs_mutex);
428 mutex_init(&q->sysfs_lock);
429 mutex_init(&q->sysfs_dir_lock);
430 spin_lock_init(&q->queue_lock);
431
432 init_waitqueue_head(&q->mq_freeze_wq);
433 mutex_init(&q->mq_freeze_lock);
434
435 /*
436 * Init percpu_ref in atomic mode so that it's faster to shutdown.
437 * See blk_register_queue() for details.
438 */
439 if (percpu_ref_init(&q->q_usage_counter,
440 blk_queue_usage_counter_release,
441 PERCPU_REF_INIT_ATOMIC, GFP_KERNEL))
442 goto fail_stats;
443
444 blk_set_default_limits(&q->limits);
445 q->nr_requests = BLKDEV_DEFAULT_RQ;
446
447 return q;
448
449 fail_stats:
450 blk_free_queue_stats(q->stats);
451 fail_id:
452 ida_free(&blk_queue_ida, q->id);
453 fail_srcu:
454 if (alloc_srcu)
455 cleanup_srcu_struct(q->srcu);
456 fail_q:
457 kmem_cache_free(blk_get_queue_kmem_cache(alloc_srcu), q);
458 return NULL;
459 }
460
461 /**
462 * blk_get_queue - increment the request_queue refcount
463 * @q: the request_queue structure to increment the refcount for
464 *
465 * Increment the refcount of the request_queue kobject.
466 *
467 * Context: Any context.
468 */
469 bool blk_get_queue(struct request_queue *q)
470 {
471 if (unlikely(blk_queue_dying(q)))
472 return false;
473 kobject_get(&q->kobj);
474 return true;
475 }
476 EXPORT_SYMBOL(blk_get_queue);
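/*
 * Illustrative sketch (hypothetical): pinning a request_queue across
 * asynchronous work pairs blk_get_queue() with blk_put_queue().
 */
static void example_pin_queue(struct request_queue *q)
{
	if (!blk_get_queue(q))	/* fails once the queue is dying */
		return;
	/* ... q stays valid here even if the original owner drops it ... */
	blk_put_queue(q);	/* the last ref must not be dropped from atomic context */
}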
477
478 #ifdef CONFIG_FAIL_MAKE_REQUEST
479
480 static DECLARE_FAULT_ATTR(fail_make_request);
481
482 static int __init setup_fail_make_request(char *str)
483 {
484 return setup_fault_attr(&fail_make_request, str);
485 }
486 __setup("fail_make_request=", setup_fail_make_request);
487
488 bool should_fail_request(struct block_device *part, unsigned int bytes)
489 {
490 return part->bd_make_it_fail && should_fail(&fail_make_request, bytes);
491 }
492
493 static int __init fail_make_request_debugfs(void)
494 {
495 struct dentry *dir = fault_create_debugfs_attr("fail_make_request",
496 NULL, &fail_make_request);
497
498 return PTR_ERR_OR_ZERO(dir);
499 }
500
501 late_initcall(fail_make_request_debugfs);
502 #endif /* CONFIG_FAIL_MAKE_REQUEST */
503
504 static inline void bio_check_ro(struct bio *bio)
505 {
506 if (op_is_write(bio_op(bio)) && bdev_read_only(bio->bi_bdev)) {
507 if (op_is_flush(bio->bi_opf) && !bio_sectors(bio))
508 return;
509 pr_warn_ratelimited("Trying to write to read-only block-device %pg\n",
510 bio->bi_bdev);
511 /* Older lvm-tools actually trigger this */
512 }
513 }
514
515 static noinline int should_fail_bio(struct bio *bio)
516 {
517 if (should_fail_request(bdev_whole(bio->bi_bdev), bio->bi_iter.bi_size))
518 return -EIO;
519 return 0;
520 }
521 ALLOW_ERROR_INJECTION(should_fail_bio, ERRNO);
522
523 /*
524 * Check whether this bio extends beyond the end of the device or partition.
525 * This may well happen - the kernel calls bread() without checking the size of
526 * the device, e.g., when mounting a file system.
527 */
528 static inline int bio_check_eod(struct bio *bio)
529 {
530 sector_t maxsector = bdev_nr_sectors(bio->bi_bdev);
531 unsigned int nr_sectors = bio_sectors(bio);
532
533 if (nr_sectors && maxsector &&
534 (nr_sectors > maxsector ||
535 bio->bi_iter.bi_sector > maxsector - nr_sectors)) {
536 pr_info_ratelimited("%s: attempt to access beyond end of device\n"
537 "%pg: rw=%d, sector=%llu, nr_sectors = %u limit=%llu\n",
538 current->comm, bio->bi_bdev, bio->bi_opf,
539 bio->bi_iter.bi_sector, nr_sectors, maxsector);
540 return -EIO;
541 }
542 return 0;
543 }
544
545 /*
546 * Remap block n of partition p to block n+start(p) of the disk.
547 */
548 static int blk_partition_remap(struct bio *bio)
549 {
550 struct block_device *p = bio->bi_bdev;
551
552 if (unlikely(should_fail_request(p, bio->bi_iter.bi_size)))
553 return -EIO;
554 if (bio_sectors(bio)) {
555 bio->bi_iter.bi_sector += p->bd_start_sect;
556 trace_block_bio_remap(bio, p->bd_dev,
557 bio->bi_iter.bi_sector -
558 p->bd_start_sect);
559 }
560 bio_set_flag(bio, BIO_REMAPPED);
561 return 0;
562 }
563
564 /*
565 * Check write append to a zoned block device.
566 */
567 static inline blk_status_t blk_check_zone_append(struct request_queue *q,
568 struct bio *bio)
569 {
570 int nr_sectors = bio_sectors(bio);
571
572 /* Only applicable to zoned block devices */
573 if (!bdev_is_zoned(bio->bi_bdev))
574 return BLK_STS_NOTSUPP;
575
576 /* The bio sector must point to the start of a sequential zone */
577 if (bio->bi_iter.bi_sector & (bdev_zone_sectors(bio->bi_bdev) - 1) ||
578 !bio_zone_is_seq(bio))
579 return BLK_STS_IOERR;
580
581 /*
582 * Not allowed to cross zone boundaries. Otherwise, the BIO will be
583 * split and could result in non-contiguous sectors being written in
584 * different zones.
585 */
586 if (nr_sectors > q->limits.chunk_sectors)
587 return BLK_STS_IOERR;
588
589 /* Make sure the BIO is small enough and will not get split */
590 if (nr_sectors > q->limits.max_zone_append_sectors)
591 return BLK_STS_IOERR;
592
593 bio->bi_opf |= REQ_NOMERGE;
594
595 return BLK_STS_OK;
596 }
597
598 static void __submit_bio(struct bio *bio)
599 {
600 struct gendisk *disk = bio->bi_bdev->bd_disk;
601
602 if (unlikely(!blk_crypto_bio_prep(&bio)))
603 return;
604
605 if (!disk->fops->submit_bio) {
606 blk_mq_submit_bio(bio);
607 } else if (likely(bio_queue_enter(bio) == 0)) {
608 disk->fops->submit_bio(bio);
609 blk_queue_exit(disk->queue);
610 }
611 }
612
613 /*
614 * The loop in this function may be a bit non-obvious, and so deserves some
615 * explanation:
616 *
617 * - Before entering the loop, bio->bi_next is NULL (as all callers ensure
618 * that), so we have a list with a single bio.
619 * - We pretend that we have just taken it off a longer list, so we assign
620 * bio_list to a pointer to the bio_list_on_stack, thus initialising the
621 * bio_list of new bios to be added. ->submit_bio() may indeed add some more
622 * bios through a recursive call to submit_bio_noacct. If it did, we find a
623 * non-NULL value in bio_list and re-enter the loop from the top.
624 * - In this case we really did just take the bio off the top of the list (no
625 * pretending) and so remove it from bio_list, and call into ->submit_bio()
626 * again.
627 *
628 * bio_list_on_stack[0] contains bios submitted by the current ->submit_bio.
629 * bio_list_on_stack[1] contains bios that were submitted before the current
630 * ->submit_bio, but that haven't been processed yet.
631 */
632 static void __submit_bio_noacct(struct bio *bio)
633 {
634 struct bio_list bio_list_on_stack[2];
635
636 BUG_ON(bio->bi_next);
637
638 bio_list_init(&bio_list_on_stack[0]);
639 current->bio_list = bio_list_on_stack;
640
641 do {
642 struct request_queue *q = bdev_get_queue(bio->bi_bdev);
643 struct bio_list lower, same;
644
645 /*
646 * Create a fresh bio_list for all subordinate requests.
647 */
648 bio_list_on_stack[1] = bio_list_on_stack[0];
649 bio_list_init(&bio_list_on_stack[0]);
650
651 __submit_bio(bio);
652
653 /*
654 * Sort new bios into those for a lower level and those for the
655 * same level.
656 */
657 bio_list_init(&lower);
658 bio_list_init(&same);
659 while ((bio = bio_list_pop(&bio_list_on_stack[0])) != NULL)
660 if (q == bdev_get_queue(bio->bi_bdev))
661 bio_list_add(&same, bio);
662 else
663 bio_list_add(&lower, bio);
664
665 /*
666 * Now assemble so we handle the lowest level first.
667 */
668 bio_list_merge(&bio_list_on_stack[0], &lower);
669 bio_list_merge(&bio_list_on_stack[0], &same);
670 bio_list_merge(&bio_list_on_stack[0], &bio_list_on_stack[1]);
671 } while ((bio = bio_list_pop(&bio_list_on_stack[0])));
672
673 current->bio_list = NULL;
674 }
675
676 static void __submit_bio_noacct_mq(struct bio *bio)
677 {
678 struct bio_list bio_list[2] = { };
679
680 current->bio_list = bio_list;
681
682 do {
683 __submit_bio(bio);
684 } while ((bio = bio_list_pop(&bio_list[0])));
685
686 current->bio_list = NULL;
687 }
688
689 void submit_bio_noacct_nocheck(struct bio *bio)
690 {
691 blk_cgroup_bio_start(bio);
692 blkcg_bio_issue_init(bio);
693
694 if (!bio_flagged(bio, BIO_TRACE_COMPLETION)) {
695 trace_block_bio_queue(bio);
696 /*
697 * Now that enqueuing has been traced, we need to trace
698 * completion as well.
699 */
700 bio_set_flag(bio, BIO_TRACE_COMPLETION);
701 }
702
703 /*
704 * We only want one ->submit_bio to be active at a time, else stack
705 * usage with stacked devices could be a problem. Use current->bio_list
706 * to collect a list of requests submitted by a ->submit_bio method while
707 * it is active, and then process them after it returned.
708 */
709 if (current->bio_list)
710 bio_list_add(&current->bio_list[0], bio);
711 else if (!bio->bi_bdev->bd_disk->fops->submit_bio)
712 __submit_bio_noacct_mq(bio);
713 else
714 __submit_bio_noacct(bio);
715 }
716
717 /**
718 * submit_bio_noacct - re-submit a bio to the block device layer for I/O
719 * @bio: The bio describing the location in memory and on the device.
720 *
721 * This is a version of submit_bio() that shall only be used for I/O that is
722 * resubmitted to lower level drivers by stacking block drivers. All file
723 * systems and other upper level users of the block layer should use
724 * submit_bio() instead.
725 */
726 void submit_bio_noacct(struct bio *bio)
727 {
728 struct block_device *bdev = bio->bi_bdev;
729 struct request_queue *q = bdev_get_queue(bdev);
730 blk_status_t status = BLK_STS_IOERR;
731 struct blk_plug *plug;
732
733 might_sleep();
734
735 plug = blk_mq_plug(bio);
736 if (plug && plug->nowait)
737 bio->bi_opf |= REQ_NOWAIT;
738
739 /*
740 * For a REQ_NOWAIT based request, return -EOPNOTSUPP
741 * if queue does not support NOWAIT.
742 */
743 if ((bio->bi_opf & REQ_NOWAIT) && !bdev_nowait(bdev))
744 goto not_supported;
745
746 if (should_fail_bio(bio))
747 goto end_io;
748 bio_check_ro(bio);
749 if (!bio_flagged(bio, BIO_REMAPPED)) {
750 if (unlikely(bio_check_eod(bio)))
751 goto end_io;
752 if (bdev->bd_partno && unlikely(blk_partition_remap(bio)))
753 goto end_io;
754 }
755
756 /*
757 * Filter flush bio's early so that bio based drivers without flush
758 * support don't have to worry about them.
759 */
760 if (op_is_flush(bio->bi_opf) &&
761 !test_bit(QUEUE_FLAG_WC, &q->queue_flags)) {
762 bio->bi_opf &= ~(REQ_PREFLUSH | REQ_FUA);
763 if (!bio_sectors(bio)) {
764 status = BLK_STS_OK;
765 goto end_io;
766 }
767 }
768
769 if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
770 bio_clear_polled(bio);
771
772 switch (bio_op(bio)) {
773 case REQ_OP_DISCARD:
774 if (!bdev_max_discard_sectors(bdev))
775 goto not_supported;
776 break;
777 case REQ_OP_SECURE_ERASE:
778 if (!bdev_max_secure_erase_sectors(bdev))
779 goto not_supported;
780 break;
781 case REQ_OP_ZONE_APPEND:
782 status = blk_check_zone_append(q, bio);
783 if (status != BLK_STS_OK)
784 goto end_io;
785 break;
786 case REQ_OP_ZONE_RESET:
787 case REQ_OP_ZONE_OPEN:
788 case REQ_OP_ZONE_CLOSE:
789 case REQ_OP_ZONE_FINISH:
790 if (!bdev_is_zoned(bio->bi_bdev))
791 goto not_supported;
792 break;
793 case REQ_OP_ZONE_RESET_ALL:
794 if (!bdev_is_zoned(bio->bi_bdev) || !blk_queue_zone_resetall(q))
795 goto not_supported;
796 break;
797 case REQ_OP_WRITE_ZEROES:
798 if (!q->limits.max_write_zeroes_sectors)
799 goto not_supported;
800 break;
801 default:
802 break;
803 }
804
805 if (blk_throtl_bio(bio))
806 return;
807 submit_bio_noacct_nocheck(bio);
808 return;
809
810 not_supported:
811 status = BLK_STS_NOTSUPP;
812 end_io:
813 bio->bi_status = status;
814 bio_endio(bio);
815 }
816 EXPORT_SYMBOL(submit_bio_noacct);
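/*
 * Illustrative sketch (hypothetical, loosely modelled on what dm/md style
 * stacking drivers do): redirect a bio to a lower device and resubmit it
 * with submit_bio_noacct() so it is not accounted a second time.
 */
static void example_stacked_submit(struct bio *bio, struct block_device *lower,
				   sector_t offset)
{
	bio_set_dev(bio, lower);		/* retarget the bio */
	bio->bi_iter.bi_sector += offset;	/* apply the stacking remap */
	submit_bio_noacct(bio);
}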
817
818 #ifdef CONFIG_BLK_MQ_USE_LOCAL_THREAD
819 extern bool test_task_ux(struct task_struct *task);
820 #endif
821
822 static void bio_set_ioprio(struct bio *bio)
823 {
824 /* Nobody set ioprio so far? Initialize it based on task's nice value */
825 if (IOPRIO_PRIO_CLASS(bio->bi_ioprio) == IOPRIO_CLASS_NONE)
826 bio->bi_ioprio = get_current_ioprio();
827 blkcg_set_ioprio(bio);
828 #ifdef CONFIG_BLK_MQ_USE_LOCAL_THREAD
829 bio_cnt++;
830
831 if (IOPRIO_PRIO_CLASS(bio->bi_ioprio) == IOPRIO_CLASS_RT) {
832 rt_bio_cnt++;
833 } else if (test_task_ux(current)) {
834 bio->bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT, 4);
835 rt_bio_cnt++;
836 ux_bio_cnt++;
837 }
838 #endif
839 }
840
841 /**
842 * submit_bio - submit a bio to the block device layer for I/O
843 * @bio: The &struct bio which describes the I/O
844 *
845 * submit_bio() is used to submit I/O requests to block devices. It is passed a
846 * fully set up &struct bio that describes the I/O that needs to be done. The
847 * bio will be sent to the device described by the bi_bdev field.
848 *
849 * The success/failure status of the request, along with notification of
850 * completion, is delivered asynchronously through the ->bi_end_io() callback
851 * in @bio. The bio must NOT be touched by the caller until ->bi_end_io() has
852 * been called.
853 */
854 void submit_bio(struct bio *bio)
855 {
856 if (blkcg_punt_bio_submit(bio))
857 return;
858
859 if (bio_op(bio) == REQ_OP_READ) {
860 task_io_account_read(bio->bi_iter.bi_size);
861 count_vm_events(PGPGIN, bio_sectors(bio));
862 } else if (bio_op(bio) == REQ_OP_WRITE) {
863 count_vm_events(PGPGOUT, bio_sectors(bio));
864 }
865
866 bio_set_ioprio(bio);
867 submit_bio_noacct(bio);
868 }
869 EXPORT_SYMBOL(submit_bio);
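/*
 * Illustrative sketch (hypothetical helper, assumes <linux/bio.h> and
 * <linux/completion.h>): the usual shape of a submit_bio() caller that
 * waits synchronously. Real code can use submit_bio_wait() for this; the
 * open-coded version only shows the ->bi_end_io contract described above.
 */
static void example_end_io(struct bio *bio)
{
	complete(bio->bi_private);	/* the submitter still owns the bio */
}

static int example_read_page(struct block_device *bdev, struct page *page,
			     sector_t sector)
{
	DECLARE_COMPLETION_ONSTACK(done);
	struct bio *bio;
	int ret;

	/* GFP_KERNEL allocations from fs_bio_set do not fail */
	bio = bio_alloc(bdev, 1, REQ_OP_READ, GFP_KERNEL);
	bio->bi_iter.bi_sector = sector;
	__bio_add_page(bio, page, PAGE_SIZE, 0);
	bio->bi_end_io = example_end_io;
	bio->bi_private = &done;

	submit_bio(bio);
	wait_for_completion(&done);	/* bio must not be touched until ->bi_end_io ran */
	ret = blk_status_to_errno(bio->bi_status);
	bio_put(bio);
	return ret;
}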
870
871 /**
872 * bio_poll - poll for BIO completions
873 * @bio: bio to poll for
874 * @iob: batches of IO
875 * @flags: BLK_POLL_* flags that control the behavior
876 *
877 * Poll for completions on queue associated with the bio. Returns number of
878 * completed entries found.
879 *
880 * Note: the caller must either be the context that submitted @bio, or
881 * be in an RCU critical section to prevent freeing of @bio.
882 */
883 int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags)
884 {
885 blk_qc_t cookie = READ_ONCE(bio->bi_cookie);
886 struct block_device *bdev;
887 struct request_queue *q;
888 int ret = 0;
889
890 bdev = READ_ONCE(bio->bi_bdev);
891 if (!bdev)
892 return 0;
893
894 q = bdev_get_queue(bdev);
895 if (cookie == BLK_QC_T_NONE ||
896 !test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
897 return 0;
898
899 /*
900 * As the requests that require a zone lock are not plugged in the
901 * first place, directly accessing the plug instead of using
902 * blk_mq_plug() should not have any consequences during flushing for
903 * zoned devices.
904 */
905 blk_flush_plug(current->plug, false);
906
907 /*
908 * We need to be able to enter a frozen queue, similar to how
909 * timeouts also need to do that. If that is blocked, then we can
910 * have pending IO when a queue freeze is started, and then the
911 * wait for the freeze to finish will wait for polled requests to
912 * timeout as the poller is prevented from entering the queue and
913 * completing them. As long as we prevent new IO from being queued,
914 * that should be all that matters.
915 */
916 if (!percpu_ref_tryget(&q->q_usage_counter))
917 return 0;
918 if (queue_is_mq(q)) {
919 ret = blk_mq_poll(q, cookie, iob, flags);
920 } else {
921 struct gendisk *disk = q->disk;
922
923 if (disk && disk->fops->poll_bio)
924 ret = disk->fops->poll_bio(bio, iob, flags);
925 }
926 blk_queue_exit(q);
927 return ret;
928 }
929 EXPORT_SYMBOL_GPL(bio_poll);
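/*
 * Illustrative sketch (hypothetical, simplified from the block device
 * direct I/O HIPRI path): a submitter can poll for its own completion
 * instead of sleeping. "done" would be set to true by the bio's
 * ->bi_end_io callback.
 */
static void example_poll_for_completion(struct bio *bio, bool *done)
{
	bio->bi_opf |= REQ_POLLED;	/* request a polled completion */
	submit_bio(bio);

	while (!READ_ONCE(*done)) {
		/* bio_poll() returns 0 if polling is unsupported or found nothing */
		if (!bio_poll(bio, NULL, 0))
			cond_resched();
	}
}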
930
931 /*
932 * Helper to implement file_operations.iopoll. Requires the bio to be stored
933 * in iocb->private, and cleared before freeing the bio.
934 */
935 int iocb_bio_iopoll(struct kiocb *kiocb, struct io_comp_batch *iob,
936 unsigned int flags)
937 {
938 struct bio *bio;
939 int ret = 0;
940
941 /*
942 * Note: the bio cache only uses SLAB_TYPESAFE_BY_RCU, so bio can
943 * point to a freshly allocated bio at this point. If that happens
944 * we have a few cases to consider:
945 *
946 * 1) the bio is being initialized and bi_bdev is NULL. We can simply
947 * do nothing in this case
948 * 2) the bio points to a device that is not poll enabled. bio_poll
949 * will catch this and return 0
950 * 3) the bio points to a poll capable device, including but not
951 * limited to the one that the original bio pointed to. In this
952 * case we will call into the actual poll method and poll for I/O,
953 * even if we don't need to, but it won't cause harm either.
954 *
955 * For cases 2) and 3) above the RCU grace period ensures that bi_bdev
956 * is still allocated. Because partitions hold a reference to the whole
957 * device bdev and thus disk, the disk is also still valid. Grabbing
958 * a reference to the queue in bio_poll() ensures the hctxs and requests
959 * are still valid as well.
960 */
961 rcu_read_lock();
962 bio = READ_ONCE(kiocb->private);
963 if (bio)
964 ret = bio_poll(bio, iob, flags);
965 rcu_read_unlock();
966
967 return ret;
968 }
969 EXPORT_SYMBOL_GPL(iocb_bio_iopoll);
970
971 void update_io_ticks(struct block_device *part, unsigned long now, bool end)
972 {
973 unsigned long stamp;
974 again:
975 stamp = READ_ONCE(part->bd_stamp);
976 if (unlikely(time_after(now, stamp)) &&
977 likely(try_cmpxchg(&part->bd_stamp, &stamp, now)) &&
978 (end || part_in_flight(part)))
979 __part_stat_add(part, io_ticks, now - stamp);
980
981 if (part->bd_partno) {
982 part = bdev_whole(part);
983 goto again;
984 }
985 }
986
987 unsigned long bdev_start_io_acct(struct block_device *bdev,
988 unsigned int sectors, enum req_op op,
989 unsigned long start_time)
990 {
991 const int sgrp = op_stat_group(op);
992
993 part_stat_lock();
994 update_io_ticks(bdev, start_time, false);
995 part_stat_inc(bdev, ios[sgrp]);
996 part_stat_add(bdev, sectors[sgrp], sectors);
997 part_stat_local_inc(bdev, in_flight[op_is_write(op)]);
998 part_stat_unlock();
999
1000 return start_time;
1001 }
1002 EXPORT_SYMBOL(bdev_start_io_acct);
1003
1004 /**
1005 * bio_start_io_acct_time - start I/O accounting for bio based drivers
1006 * @bio: bio to start account for
1007 * @start_time: start time that should be passed back to bio_end_io_acct().
1008 */
1009 void bio_start_io_acct_time(struct bio *bio, unsigned long start_time)
1010 {
1011 bdev_start_io_acct(bio->bi_bdev, bio_sectors(bio),
1012 bio_op(bio), start_time);
1013 }
1014 EXPORT_SYMBOL_GPL(bio_start_io_acct_time);
1015
1016 /**
1017 * bio_start_io_acct - start I/O accounting for bio based drivers
1018 * @bio: bio to start account for
1019 *
1020 * Returns the start time that should be passed back to bio_end_io_acct().
1021 */
1022 unsigned long bio_start_io_acct(struct bio *bio)
1023 {
1024 return bdev_start_io_acct(bio->bi_bdev, bio_sectors(bio),
1025 bio_op(bio), jiffies);
1026 }
1027 EXPORT_SYMBOL_GPL(bio_start_io_acct);
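/*
 * Illustrative sketch (hypothetical): the accounting pattern for a bio
 * based driver. bio_end_io_acct() is the <linux/blkdev.h> helper built on
 * the accounting functions in this file.
 */
static void example_driver_handle_bio(struct bio *bio)
{
	unsigned long start = bio_start_io_acct(bio);

	/* ... do the actual I/O; then, on completion: ... */

	bio_end_io_acct(bio, start);
	bio_endio(bio);
}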
1028
1029 void bdev_end_io_acct(struct block_device *bdev, enum req_op op,
1030 unsigned long start_time)
1031 {
1032 const int sgrp = op_stat_group(op);
1033 unsigned long now = READ_ONCE(jiffies);
1034 unsigned long duration = now - start_time;
1035
1036 part_stat_lock();
1037 update_io_ticks(bdev, now, true);
1038 part_stat_add(bdev, nsecs[sgrp], jiffies_to_nsecs(duration));
1039 part_stat_local_dec(bdev, in_flight[op_is_write(op)]);
1040 part_stat_unlock();
1041 }
1042 EXPORT_SYMBOL(bdev_end_io_acct);
1043
1044 void bio_end_io_acct_remapped(struct bio *bio, unsigned long start_time,
1045 struct block_device *orig_bdev)
1046 {
1047 bdev_end_io_acct(orig_bdev, bio_op(bio), start_time);
1048 }
1049 EXPORT_SYMBOL_GPL(bio_end_io_acct_remapped);
1050
1051 /**
1052 * blk_lld_busy - Check if underlying low-level drivers of a device are busy
1053 * @q : the queue of the device being checked
1054 *
1055 * Description:
1056 * Check if underlying low-level drivers of a device are busy.
1057 * If the drivers want to export their busy state, they must set own
1058 * exporting function using blk_queue_lld_busy() first.
1059 *
1060 * Basically, this function is used only by request stacking drivers
1061 * to stop dispatching requests to underlying devices when underlying
1062 * devices are busy. This behavior helps more I/O merging on the queue
1063 * of the request stacking driver and prevents I/O throughput regression
1064 * on burst I/O load.
1065 *
1066 * Return:
1067 * 0 - Not busy (The request stacking driver should dispatch request)
1068 * 1 - Busy (The request stacking driver should stop dispatching request)
1069 */
1070 int blk_lld_busy(struct request_queue *q)
1071 {
1072 if (queue_is_mq(q) && q->mq_ops->busy)
1073 return q->mq_ops->busy(q);
1074
1075 return 0;
1076 }
1077 EXPORT_SYMBOL_GPL(blk_lld_busy);
1078
1079 int kblockd_schedule_work(struct work_struct *work)
1080 {
1081 return queue_work(kblockd_workqueue, work);
1082 }
1083 EXPORT_SYMBOL(kblockd_schedule_work);
1084
1085 int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork,
1086 unsigned long delay)
1087 {
1088 return mod_delayed_work_on(cpu, kblockd_workqueue, dwork, delay);
1089 }
1090 EXPORT_SYMBOL(kblockd_mod_delayed_work_on);
1091
1092 void blk_start_plug_nr_ios(struct blk_plug *plug, unsigned short nr_ios)
1093 {
1094 struct task_struct *tsk = current;
1095
1096 /*
1097 * If this is a nested plug, don't actually assign it.
1098 */
1099 if (tsk->plug)
1100 return;
1101
1102 plug->mq_list = NULL;
1103 plug->cached_rq = NULL;
1104 plug->nr_ios = min_t(unsigned short, nr_ios, BLK_MAX_REQUEST_COUNT);
1105 plug->rq_count = 0;
1106 plug->multiple_queues = false;
1107 plug->has_elevator = false;
1108 plug->nowait = false;
1109 INIT_LIST_HEAD(&plug->cb_list);
1110
1111 /*
1112 * Store ordering should not be needed here, since a potential
1113 * preempt will imply a full memory barrier
1114 */
1115 tsk->plug = plug;
1116 }
1117
1118 /**
1119 * blk_start_plug - initialize blk_plug and track it inside the task_struct
1120 * @plug: The &struct blk_plug that needs to be initialized
1121 *
1122 * Description:
1123 * blk_start_plug() indicates to the block layer an intent by the caller
1124 * to submit multiple I/O requests in a batch. The block layer may use
1125 * this hint to defer submitting I/Os from the caller until blk_finish_plug()
1126 * is called. However, the block layer may choose to submit requests
1127 * before a call to blk_finish_plug() if the number of queued I/Os
1128 * exceeds %BLK_MAX_REQUEST_COUNT, or if the size of the I/O is larger than
1129 * %BLK_PLUG_FLUSH_SIZE. The queued I/Os may also be submitted early if
1130 * the task schedules (see below).
1131 *
1132 * Tracking blk_plug inside the task_struct will help with auto-flushing the
1133 * pending I/O should the task end up blocking between blk_start_plug() and
1134 * blk_finish_plug(). This is important from a performance perspective, but
1135 * also ensures that we don't deadlock. For instance, if the task is blocking
1136 * for a memory allocation, memory reclaim could end up wanting to free a
1137 * page belonging to that request that is currently residing in our private
1138 * plug. By flushing the pending I/O when the process goes to sleep, we avoid
1139 * this kind of deadlock.
1140 */
1141 void blk_start_plug(struct blk_plug *plug)
1142 {
1143 blk_start_plug_nr_ios(plug, 1);
1144 }
1145 EXPORT_SYMBOL(blk_start_plug);
1146
1147 static void flush_plug_callbacks(struct blk_plug *plug, bool from_schedule)
1148 {
1149 LIST_HEAD(callbacks);
1150
1151 while (!list_empty(&plug->cb_list)) {
1152 list_splice_init(&plug->cb_list, &callbacks);
1153
1154 while (!list_empty(&callbacks)) {
1155 struct blk_plug_cb *cb = list_first_entry(&callbacks,
1156 struct blk_plug_cb,
1157 list);
1158 list_del(&cb->list);
1159 cb->callback(cb, from_schedule);
1160 }
1161 }
1162 }
1163
1164 struct blk_plug_cb *blk_check_plugged(blk_plug_cb_fn unplug, void *data,
1165 int size)
1166 {
1167 struct blk_plug *plug = current->plug;
1168 struct blk_plug_cb *cb;
1169
1170 if (!plug)
1171 return NULL;
1172
1173 list_for_each_entry(cb, &plug->cb_list, list)
1174 if (cb->callback == unplug && cb->data == data)
1175 return cb;
1176
1177 /* Not currently on the callback list */
1178 BUG_ON(size < sizeof(*cb));
1179 cb = kzalloc(size, GFP_ATOMIC);
1180 if (cb) {
1181 cb->data = data;
1182 cb->callback = unplug;
1183 list_add(&cb->list, &plug->cb_list);
1184 }
1185 return cb;
1186 }
1187 EXPORT_SYMBOL(blk_check_plugged);
1188
1189 void __blk_flush_plug(struct blk_plug *plug, bool from_schedule)
1190 {
1191 if (!list_empty(&plug->cb_list))
1192 flush_plug_callbacks(plug, from_schedule);
1193 blk_mq_flush_plug_list(plug, from_schedule);
1194 /*
1195 * Unconditionally flush out cached requests, even if the unplug
1196 * event came from schedule. Since we now hold references to the
1197 * queue for cached requests, we don't want a blocked task holding
1198 * up a queue freeze/quiesce event.
1199 */
1200 if (unlikely(!rq_list_empty(plug->cached_rq)))
1201 blk_mq_free_plug_rqs(plug);
1202 }
1203
1204 /**
1205 * blk_finish_plug - mark the end of a batch of submitted I/O
1206 * @plug: The &struct blk_plug passed to blk_start_plug()
1207 *
1208 * Description:
1209 * Indicate that a batch of I/O submissions is complete. This function
1210 * must be paired with an initial call to blk_start_plug(). The intent
1211 * is to allow the block layer to optimize I/O submission. See the
1212 * documentation for blk_start_plug() for more information.
1213 */
1214 void blk_finish_plug(struct blk_plug *plug)
1215 {
1216 if (plug == current->plug) {
1217 __blk_flush_plug(plug, false);
1218 current->plug = NULL;
1219 }
1220 }
1221 EXPORT_SYMBOL(blk_finish_plug);
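/*
 * Illustrative sketch (hypothetical): the submitter-side plugging pattern
 * the kernel-doc above describes, batching a run of bios so the block
 * layer can merge and dispatch them together.
 */
static void example_submit_batch(struct bio **bios, int nr)
{
	struct blk_plug plug;
	int i;

	blk_start_plug(&plug);
	for (i = 0; i < nr; i++)
		submit_bio(bios[i]);	/* may be held in the plug, not dispatched yet */
	blk_finish_plug(&plug);		/* flushes whatever is still plugged */
}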
1222
1223 void blk_io_schedule(void)
1224 {
1225 /* Prevent hang_check timer from firing at us during very long I/O */
1226 unsigned long timeout = sysctl_hung_task_timeout_secs * HZ / 2;
1227
1228 if (timeout)
1229 io_schedule_timeout(timeout);
1230 else
1231 io_schedule();
1232 }
1233 EXPORT_SYMBOL_GPL(blk_io_schedule);
1234
1235 int __init blk_dev_init(void)
1236 {
1237 #ifdef CONFIG_BLK_MQ_USE_LOCAL_THREAD
1238 const char *config = of_blk_feature_read("kblockd_ux_unbound_enable");
1239 #endif
1240 BUILD_BUG_ON((__force u32)REQ_OP_LAST >= (1 << REQ_OP_BITS));
1241 BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 *
1242 sizeof_field(struct request, cmd_flags));
1243 BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 *
1244 sizeof_field(struct bio, bi_opf));
1245 BUILD_BUG_ON(ALIGN(offsetof(struct request_queue, srcu),
1246 __alignof__(struct request_queue)) !=
1247 sizeof(struct request_queue));
1248
1249 #ifdef CONFIG_BLK_MQ_USE_LOCAL_THREAD
1250 if (config && strcmp(config, "y") == 0)
1251 kblockd_workqueue = alloc_workqueue("kblockd",
1252 WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_UX | WQ_UNBOUND, 0);
1253 else
1254 #endif
1255 /* used for unplugging and affects IO latency/throughput - HIGHPRI */
1256 kblockd_workqueue = alloc_workqueue("kblockd",
1257 WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
1258 if (!kblockd_workqueue)
1259 panic("Failed to create kblockd\n");
1260
1261 blk_requestq_cachep = kmem_cache_create("request_queue",
1262 sizeof(struct request_queue), 0, SLAB_PANIC, NULL);
1263
1264 blk_requestq_srcu_cachep = kmem_cache_create("request_queue_srcu",
1265 sizeof(struct request_queue) +
1266 sizeof(struct srcu_struct), 0, SLAB_PANIC, NULL);
1267
1268 blk_debugfs_root = debugfs_create_dir("block", NULL);
1269 blk_mq_debugfs_init();
1270
1271 return 0;
1272 }
1273
Question: what is the code flow of submit_bio(), and how does a bio get submitted as a request?
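A short answer, based on the code above and on blk-mq.c in the same tree. For a blk-mq backed disk (one whose gendisk fops has no ->submit_bio) the path is:

submit_bio() -> submit_bio_noacct() -> submit_bio_noacct_nocheck() -> __submit_bio() -> blk_mq_submit_bio()

submit_bio() only does vm/task accounting and ioprio setup; submit_bio_noacct() performs the validity checks (end-of-device, partition remap, flush filtering, zone/discard/write-zeroes support); submit_bio_noacct_nocheck() handles cgroup/trace setup and the current->bio_list recursion control; __submit_bio() then calls blk_mq_submit_bio() (in blk-mq.c), which first tries to merge the bio into a plugged or scheduler request and otherwise allocates a struct request, initialises it from the bio via blk_mq_bio_to_request(), and either adds it to the current plug or inserts/issues it to the hardware queue. Bio-based drivers (those that do provide ->submit_bio) never build a struct request: __submit_bio() hands the bio to the driver's ->submit_bio directly.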