// SPDX-License-Identifier: GPL-2.0
/*
 * Simple benchmark program that uses the various features of io_uring
 * to provide fast random access to a device/file. It has various
 * options that control how we use io_uring, see the OPTIONS section
 * below. This uses the raw io_uring interface.
 *
 * Copyright (C) 2018-2019 Jens Axboe
 */
#include <stdio.h>
#include <errno.h>
#include <assert.h>
#include <stdlib.h>
#include <stddef.h>
#include <signal.h>
#include <inttypes.h>

#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <sys/resource.h>
#include <sys/mman.h>
#include <sys/uio.h>
#include <linux/fs.h>
#include <fcntl.h>
#include <unistd.h>
#include <string.h>
#include <pthread.h>
#include <sched.h>

#include "liburing.h"
#include "barrier.h"

#define min(a, b)       (((a) < (b)) ? (a) : (b))

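/*
 * Userspace views of the submission and completion rings. Each member
 * points into the memory the kernel exports through mmap() on the ring
 * fd, so loads and stores through these pointers are shared with the
 * kernel; see setup_ring() below.
 */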
struct io_sq_ring {
    unsigned *head;
    unsigned *tail;
    unsigned *ring_mask;
    unsigned *ring_entries;
    unsigned *flags;
    unsigned *array;
};

struct io_cq_ring {
    unsigned *head;
    unsigned *tail;
    unsigned *ring_mask;
    unsigned *ring_entries;
    struct io_uring_cqe *cqes;
};

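/*
 * Benchmark tunables: total queue depth, how many SQEs to prepare and
 * how many completions to wait for per io_uring_enter() call, the block
 * size of each read, and the maximum number of files/devices.
 */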
#define DEPTH           128

#define BATCH_SUBMIT        32
#define BATCH_COMPLETE      32

#define BS          4096

#define MAX_FDS         16

static unsigned sq_ring_mask, cq_ring_mask;

struct file {
    unsigned long max_blocks;
    unsigned pending_ios;
    int real_fd;
    int fixed_fd;
};

struct submitter {
    pthread_t thread;
    int ring_fd;
    struct drand48_data rand;
    struct io_sq_ring sq_ring;
    struct io_uring_sqe *sqes;
    struct iovec iovecs[DEPTH];
    struct io_cq_ring cq_ring;
    int inflight;
    unsigned long reaps;
    unsigned long done;
    unsigned long calls;
    volatile int finish;

    __s32 *fds;

    struct file files[MAX_FDS];
    unsigned nr_files;
    unsigned cur_file;
};

static struct submitter submitters[1];
static volatile int finish;

/*
 * OPTIONS: Set these to test the various features of io_uring.
 */
static int polled = 1;      /* use IO polling */
static int fixedbufs = 1;   /* use fixed user buffers */
static int register_files = 1;  /* use fixed files */
static int buffered = 0;    /* use buffered IO, not O_DIRECT */
static int sq_thread_poll = 0;  /* use kernel submission/poller thread */
static int sq_thread_cpu = -1;  /* pin above thread to this CPU */
static int do_nop = 0;      /* no-op SQ ring commands */

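/*
 * Register the preallocated per-IO buffers with the kernel so reads can
 * use IORING_OP_READ_FIXED and avoid mapping the buffer on every IO.
 */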
static int io_uring_register_buffers(struct submitter *s)
{
    if (do_nop)
        return 0;

    return io_uring_register(s->ring_fd, IORING_REGISTER_BUFFERS, s->iovecs,
                    DEPTH);
}

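/*
 * Register the open file descriptors so SQEs can reference them by
 * index with IOSQE_FIXED_FILE instead of passing a real fd each time.
 */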
static int io_uring_register_files(struct submitter *s)
{
    unsigned i;

    if (do_nop)
        return 0;

    s->fds = calloc(s->nr_files, sizeof(__s32));
    if (!s->fds)
        return -ENOMEM;
    for (i = 0; i < s->nr_files; i++) {
        s->fds[i] = s->files[i].real_fd;
        s->files[i].fixed_fd = i;
    }

    return io_uring_register(s->ring_fd, IORING_REGISTER_FILES, s->fds,
                    s->nr_files);
}

static int lk_gettid(void)
{
    return syscall(__NR_gettid);
}

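/* Split the total queue depth evenly across the opened files, rounding up. */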
static unsigned file_depth(struct submitter *s)
{
    return (DEPTH + s->nr_files - 1) / s->nr_files;
}

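/*
 * Prepare one SQE: pick the next file (moving on round-robin once a file
 * has its share of the depth pending), choose a random block-aligned
 * offset, and set up either a fixed-buffer read or a one-segment readv.
 */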
static void init_io(struct submitter *s, unsigned index)
{
    struct io_uring_sqe *sqe = &s->sqes[index];
    unsigned long offset;
    struct file *f;
    long r;

    if (do_nop) {
        sqe->opcode = IORING_OP_NOP;
        return;
    }

    if (s->nr_files == 1) {
        f = &s->files[0];
    } else {
        f = &s->files[s->cur_file];
        if (f->pending_ios >= file_depth(s)) {
            s->cur_file++;
            if (s->cur_file == s->nr_files)
                s->cur_file = 0;
            f = &s->files[s->cur_file];
        }
    }
    f->pending_ios++;

    lrand48_r(&s->rand, &r);
    offset = (r % (f->max_blocks - 1)) * BS;

    if (register_files) {
        sqe->flags = IOSQE_FIXED_FILE;
        sqe->fd = f->fixed_fd;
    } else {
        sqe->flags = 0;
        sqe->fd = f->real_fd;
    }
    if (fixedbufs) {
        sqe->opcode = IORING_OP_READ_FIXED;
        sqe->addr = (unsigned long) s->iovecs[index].iov_base;
        sqe->len = BS;
        sqe->buf_index = index;
    } else {
        sqe->opcode = IORING_OP_READV;
        sqe->addr = (unsigned long) &s->iovecs[index];
        sqe->len = 1;
        sqe->buf_index = 0;
    }
    sqe->ioprio = 0;
    sqe->off = offset;
    sqe->user_data = (unsigned long) f;
}

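/*
 * Queue up to max_ios new SQEs. Ring protocol: fill the SQE and its slot
 * in the array first, then publish the new tail behind a write barrier
 * so the kernel never sees the tail move before the entries are valid.
 */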
static int prep_more_ios(struct submitter *s, unsigned max_ios)
{
    struct io_sq_ring *ring = &s->sq_ring;
    unsigned index, tail, next_tail, prepped = 0;

    next_tail = tail = *ring->tail;
    do {
        next_tail++;
        read_barrier();
        if (next_tail == *ring->head)
            break;

        index = tail & sq_ring_mask;
        init_io(s, index);
        ring->array[index] = index;
        prepped++;
        tail = next_tail;
    } while (prepped < max_ios);

    if (*ring->tail != tail) {
        /* order tail store with writes to sqes above */
        write_barrier();
        *ring->tail = tail;
        write_barrier();
    }
    return prepped;
}

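/*
 * Size the target in BS-sized blocks: BLKGETSIZE64 for block devices,
 * st_size for regular files.
 */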
static int get_file_size(struct file *f)
{
    struct stat st;

    if (fstat(f->real_fd, &st) < 0)
        return -1;
    if (S_ISBLK(st.st_mode)) {
        unsigned long long bytes;

        if (ioctl(f->real_fd, BLKGETSIZE64, &bytes) != 0)
            return -1;

        f->max_blocks = bytes / BS;
        return 0;
    } else if (S_ISREG(st.st_mode)) {
        f->max_blocks = st.st_size / BS;
        return 0;
    }

    return -1;
}

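/*
 * Drain the CQ ring: consume CQEs between head and tail, check that each
 * read returned a full block, then publish the new head so the kernel
 * can reuse those completion slots.
 */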
static int reap_events(struct submitter *s)
{
    struct io_cq_ring *ring = &s->cq_ring;
    struct io_uring_cqe *cqe;
    unsigned head, reaped = 0;

    head = *ring->head;
    do {
        struct file *f;

        read_barrier();
        if (head == *ring->tail)
            break;
        cqe = &ring->cqes[head & cq_ring_mask];
        if (!do_nop) {
            f = (struct file *) (uintptr_t) cqe->user_data;
            f->pending_ios--;
            if (cqe->res != BS) {
                printf("io: unexpected ret=%d\n", cqe->res);
                if (polled && cqe->res == -EOPNOTSUPP)
                    printf("Your filesystem doesn't support poll\n");
                return -1;
            }
        }
        reaped++;
        head++;
    } while (1);

    s->inflight -= reaped;
    *ring->head = head;
    write_barrier();
    return reaped;
}

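/*
 * Per-submitter thread: keep the ring full by preparing a batch of SQEs,
 * submitting them with io_uring_enter() (waiting for completions when
 * the ring is saturated), and reaping CQEs. With SQPOLL enabled, the
 * enter call is only made when the kernel thread asks for a wakeup.
 */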
static void *submitter_fn(void *data)
{
    struct submitter *s = data;
    struct io_sq_ring *ring = &s->sq_ring;
    int ret, prepped;

    printf("submitter=%d\n", lk_gettid());

    srand48_r(pthread_self(), &s->rand);

    prepped = 0;
    do {
        int to_wait, to_submit, this_reap, to_prep;

        if (!prepped && s->inflight < DEPTH) {
            to_prep = min(DEPTH - s->inflight, BATCH_SUBMIT);
            prepped = prep_more_ios(s, to_prep);
        }
        s->inflight += prepped;
submit_more:
        to_submit = prepped;
submit:
        if (to_submit && (s->inflight + to_submit <= DEPTH))
            to_wait = 0;
        else
            to_wait = min(s->inflight + to_submit, BATCH_COMPLETE);

        /*
         * Only need to call io_uring_enter if we're not using SQ thread
         * poll, or if IORING_SQ_NEED_WAKEUP is set.
         */
        if (!sq_thread_poll || (*ring->flags & IORING_SQ_NEED_WAKEUP)) {
            unsigned flags = 0;

            if (to_wait)
                flags = IORING_ENTER_GETEVENTS;
            if ((*ring->flags & IORING_SQ_NEED_WAKEUP))
                flags |= IORING_ENTER_SQ_WAKEUP;
            ret = io_uring_enter(s->ring_fd, to_submit, to_wait,
                        flags, NULL);
            s->calls++;
        } else {
            /*
             * The SQ poll thread picks the entries up on its own;
             * treat everything we prepped as submitted so ret is
             * never used uninitialized below.
             */
            ret = to_submit;
        }

        /*
         * For non SQ thread poll, we already got the events we needed
         * through the io_uring_enter() above. For SQ thread poll, we
         * need to loop here until we find enough events.
         */
        this_reap = 0;
        do {
            int r;
            r = reap_events(s);
            if (r == -1) {
                s->finish = 1;
                break;
            } else if (r > 0)
                this_reap += r;
        } while (sq_thread_poll && this_reap < to_wait);
        s->reaps += this_reap;

        if (ret >= 0) {
            if (!ret) {
                to_submit = 0;
                if (s->inflight)
                    goto submit;
                continue;
            } else if (ret < to_submit) {
                int diff = to_submit - ret;

                s->done += ret;
                prepped -= diff;
                goto submit_more;
            }
            s->done += ret;
            prepped = 0;
            continue;
        } else if (ret < 0) {
            if (errno == EAGAIN) {
                if (s->finish)
                    break;
                if (this_reap)
                    goto submit;
                to_submit = 0;
                goto submit;
            }
            printf("io_submit: %s\n", strerror(errno));
            break;
        }
    } while (!s->finish);

    finish = 1;
    return NULL;
}

static void sig_int(int sig)
{
    printf("Exiting on signal %d\n", sig);
    submitters[0].finish = 1;
    finish = 1;
}

static void arm_sig_int(void)
{
    struct sigaction act;

    memset(&act, 0, sizeof(act));
    act.sa_handler = sig_int;
    act.sa_flags = SA_RESTART;
    sigaction(SIGINT, &act, NULL);
}

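/*
 * Create the ring with io_uring_setup(), optionally register buffers and
 * files, then mmap() the three shared regions (SQ ring, SQE array, CQ
 * ring) and record pointers to the fields we need at the offsets the
 * kernel returned in struct io_uring_params.
 */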
static int setup_ring(struct submitter *s)
{
    struct io_sq_ring *sring = &s->sq_ring;
    struct io_cq_ring *cring = &s->cq_ring;
    struct io_uring_params p;
    int ret, fd;
    void *ptr;

    memset(&p, 0, sizeof(p));

    if (polled && !do_nop)
        p.flags |= IORING_SETUP_IOPOLL;
    if (sq_thread_poll) {
        p.flags |= IORING_SETUP_SQPOLL;
        if (sq_thread_cpu != -1) {
            p.flags |= IORING_SETUP_SQ_AFF;
            p.sq_thread_cpu = sq_thread_cpu;
        }
    }

    fd = io_uring_setup(DEPTH, &p);
    if (fd < 0) {
        perror("io_uring_setup");
        return 1;
    }
    s->ring_fd = fd;

    if (fixedbufs) {
        ret = io_uring_register_buffers(s);
        if (ret < 0) {
            perror("io_uring_register_buffers");
            return 1;
        }
    }

    if (register_files) {
        ret = io_uring_register_files(s);
        if (ret < 0) {
            perror("io_uring_register_files");
            return 1;
        }
    }

    ptr = mmap(0, p.sq_off.array + p.sq_entries * sizeof(__u32),
            PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd,
            IORING_OFF_SQ_RING);
    printf("sq_ring ptr = %p\n", ptr);
    sring->head = ptr + p.sq_off.head;
    sring->tail = ptr + p.sq_off.tail;
    sring->ring_mask = ptr + p.sq_off.ring_mask;
    sring->ring_entries = ptr + p.sq_off.ring_entries;
    sring->flags = ptr + p.sq_off.flags;
    sring->array = ptr + p.sq_off.array;
    sq_ring_mask = *sring->ring_mask;

    s->sqes = mmap(0, p.sq_entries * sizeof(struct io_uring_sqe),
            PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd,
            IORING_OFF_SQES);
    printf("sqes ptr    = %p\n", s->sqes);

    ptr = mmap(0, p.cq_off.cqes + p.cq_entries * sizeof(struct io_uring_cqe),
            PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd,
            IORING_OFF_CQ_RING);
    printf("cq_ring ptr = %p\n", ptr);
    cring->head = ptr + p.cq_off.head;
    cring->tail = ptr + p.cq_off.tail;
    cring->ring_mask = ptr + p.cq_off.ring_mask;
    cring->ring_entries = ptr + p.cq_off.ring_entries;
    cring->cqes = ptr + p.cq_off.cqes;
    cq_ring_mask = *cring->ring_mask;
    return 0;
}

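/*
 * Format the per-file pending IO counts into buf as a comma separated
 * list for the once-a-second stats line.
 */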
static void file_depths(char *buf)
{
    struct submitter *s = &submitters[0];
    unsigned i;
    char *p;

    buf[0] = '\0';
    p = buf;
    for (i = 0; i < s->nr_files; i++) {
        struct file *f = &s->files[i];

        if (i + 1 == s->nr_files)
            p += sprintf(p, "%d", f->pending_ios);
        else
            p += sprintf(p, "%d, ", f->pending_ios);
    }
}

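/*
 * Open the files named on the command line, allocate aligned IO buffers,
 * set up the ring, start the submitter thread, and print IOPS, IOs per
 * io_uring_enter() call, and per-file inflight counts once a second
 * until interrupted. A typical invocation (binary name and device path
 * are only examples) would be: ./io_uring /dev/nvme0n1
 */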
int main(int argc, char *argv[])
{
    struct submitter *s = &submitters[0];
    unsigned long done, calls, reap;
    int err, i, flags, fd;
    char *fdepths;
    void *ret;

    if (!do_nop && argc < 2) {
        printf("%s: filename\n", argv[0]);
        return 1;
    }

    flags = O_RDONLY | O_NOATIME;
    if (!buffered)
        flags |= O_DIRECT;

    i = 1;
    while (!do_nop && i < argc) {
        struct file *f;

        if (s->nr_files == MAX_FDS) {
            printf("Max number of files (%d) reached\n", MAX_FDS);
            break;
        }
        fd = open(argv[i], flags);
        if (fd < 0) {
            perror("open");
            return 1;
        }

        f = &s->files[s->nr_files];
        f->real_fd = fd;
        if (get_file_size(f)) {
            printf("failed getting size of device/file\n");
            return 1;
        }
        if (f->max_blocks <= 1) {
            printf("Zero file/device size?\n");
            return 1;
        }
        f->max_blocks--;

        printf("Added file %s\n", argv[i]);
        s->nr_files++;
        i++;
    }

    if (fixedbufs) {
        struct rlimit rlim;

        rlim.rlim_cur = RLIM_INFINITY;
        rlim.rlim_max = RLIM_INFINITY;
        if (setrlimit(RLIMIT_MEMLOCK, &rlim) < 0) {
            perror("setrlimit");
            return 1;
        }
    }

    arm_sig_int();

    for (i = 0; i < DEPTH; i++) {
        void *buf;

        if (posix_memalign(&buf, BS, BS)) {
            printf("failed alloc\n");
            return 1;
        }
        s->iovecs[i].iov_base = buf;
        s->iovecs[i].iov_len = BS;
    }

    err = setup_ring(s);
    if (err) {
        printf("ring setup failed: %s, %d\n", strerror(errno), err);
        return 1;
    }
    printf("polled=%d, fixedbufs=%d, buffered=%d", polled, fixedbufs, buffered);
    printf(" QD=%d, sq_ring=%d, cq_ring=%d\n", DEPTH, *s->sq_ring.ring_entries, *s->cq_ring.ring_entries);

    pthread_create(&s->thread, NULL, submitter_fn, s);

    fdepths = malloc(8 * s->nr_files);
    reap = calls = done = 0;
    do {
        unsigned long this_done = 0;
        unsigned long this_reap = 0;
        unsigned long this_call = 0;
        unsigned long rpc = 0, ipc = 0;

        sleep(1);
        this_done += s->done;
        this_call += s->calls;
        this_reap += s->reaps;
        if (this_call - calls) {
            rpc = (this_done - done) / (this_call - calls);
            ipc = (this_reap - reap) / (this_call - calls);
        } else
            rpc = ipc = -1;
        file_depths(fdepths);
        printf("IOPS=%lu, IOS/call=%ld/%ld, inflight=%u (%s)\n",
                this_done - done, rpc, ipc, s->inflight,
                fdepths);
        done = this_done;
        calls = this_call;
        reap = this_reap;
    } while (!finish);

    pthread_join(s->thread, &ret);
    close(s->ring_fd);
    free(fdepths);
    return 0;
}