0001
0002
0003 #include <assert.h>
0004 #include <errno.h>
0005 #include <error.h>
0006 #include <fcntl.h>
0007 #include <limits.h>
0008 #include <stdbool.h>
0009 #include <stdint.h>
0010 #include <stdio.h>
0011 #include <stdlib.h>
0012 #include <string.h>
0013 #include <unistd.h>
0014
0015 #include <arpa/inet.h>
0016 #include <linux/errqueue.h>
0017 #include <linux/if_packet.h>
0018 #include <linux/io_uring.h>
0019 #include <linux/ipv6.h>
0020 #include <linux/socket.h>
0021 #include <linux/sockios.h>
0022 #include <net/ethernet.h>
0023 #include <net/if.h>
0024 #include <netinet/in.h>
0025 #include <netinet/ip.h>
0026 #include <netinet/ip6.h>
0027 #include <netinet/tcp.h>
0028 #include <netinet/udp.h>
0029 #include <sys/ioctl.h>
0030 #include <sys/mman.h>
0031 #include <sys/resource.h>
0032 #include <sys/socket.h>
0033 #include <sys/stat.h>
0034 #include <sys/time.h>
0035 #include <sys/types.h>
0036 #include <sys/un.h>
0037 #include <sys/wait.h>
0038
/* user_data tags used to classify completion CQEs. */
#define NOTIF_TAG 0xfffffffULL	/* NOTE(review): appears unused in this file — confirm */
#define NONZC_TAG 0		/* plain (copying) send */
#define ZC_TAG 1		/* zerocopy send */

/* Transmit submission modes, selected with -m. */
enum {
	MODE_NONZC = 0,		/* IORING_OP_SEND */
	MODE_ZC = 1,		/* IORING_OP_SEND_ZC */
	MODE_ZC_FIXED = 2,	/* IORING_OP_SEND_ZC with a registered (fixed) buffer */
	MODE_MIXED = 3,		/* one of the three above, chosen at random per request */
};
0049
/* Runtime configuration, populated by parse_opts(). */
static bool cfg_cork = false;		/* -c: toggle UDP_CORK around each batch */
static int cfg_mode = MODE_ZC_FIXED;	/* -m: send mode (see MODE_* above) */
static int cfg_nr_reqs = 8;		/* -n: requests submitted per batch */
static int cfg_family = PF_UNSPEC;	/* -4/-6: address family */
static int cfg_payload_len;		/* -s: bytes per send request */
static int cfg_port = 8000;		/* -p: destination port */
static int cfg_runtime_ms = 4200;	/* -t: total runtime in milliseconds */

static socklen_t cfg_alen;		/* valid length of cfg_dst_addr */
static struct sockaddr_storage cfg_dst_addr;	/* -D: destination address */

/* Send buffer; page-aligned — presumably to suit fixed-buffer
 * registration/zerocopy pinning, TODO confirm. */
static char payload[IP_MAXPACKET] __attribute__((aligned(4096)));
0062
/*
 * NOTE(review): io_sq_ring/io_cq_ring mirror the shared ring layout but
 * are not referenced anywhere in this file — the io_uring_sq/io_uring_cq
 * structs below are what the code actually uses; confirm before removing.
 */
struct io_sq_ring {
	unsigned *head;
	unsigned *tail;
	unsigned *ring_mask;
	unsigned *ring_entries;
	unsigned *flags;
	unsigned *array;
};

struct io_cq_ring {
	unsigned *head;
	unsigned *tail;
	unsigned *ring_mask;
	unsigned *ring_entries;
	struct io_uring_cqe *cqes;
};
0079
/*
 * Userspace view of the submission queue. The k-prefixed pointers point
 * into the mmap()ed SQ ring shared with the kernel (see io_uring_mmap()).
 */
struct io_uring_sq {
	unsigned *khead;
	unsigned *ktail;
	unsigned *kring_mask;
	unsigned *kring_entries;
	unsigned *kflags;
	unsigned *kdropped;
	unsigned *array;		/* shared SQE index array */
	struct io_uring_sqe *sqes;	/* separately mapped SQE array */

	/* Local counters for SQEs handed out by io_uring_get_sqe() but not
	 * yet flushed to the shared ring by io_uring_submit(). */
	unsigned sqe_head;
	unsigned sqe_tail;

	size_t ring_sz;			/* byte size of the SQ ring mapping */
};

/* Userspace view of the completion queue (shared CQ ring mapping). */
struct io_uring_cq {
	unsigned *khead;
	unsigned *ktail;
	unsigned *kring_mask;
	unsigned *kring_entries;
	unsigned *koverflow;
	struct io_uring_cqe *cqes;

	size_t ring_sz;			/* byte size of the CQ ring mapping */
};

/* Minimal io_uring instance: both queues plus the ring file descriptor. */
struct io_uring {
	struct io_uring_sq sq;
	struct io_uring_cq cq;
	int ring_fd;
};
0112
/*
 * Fallback io_uring syscall numbers for toolchains whose headers predate
 * io_uring; alpha has its own syscall numbering.
 */
#ifdef __alpha__
# ifndef __NR_io_uring_setup
# define __NR_io_uring_setup 535
# endif
# ifndef __NR_io_uring_enter
# define __NR_io_uring_enter 536
# endif
# ifndef __NR_io_uring_register
# define __NR_io_uring_register 537
# endif
#else
# ifndef __NR_io_uring_setup
# define __NR_io_uring_setup 425
# endif
# ifndef __NR_io_uring_enter
# define __NR_io_uring_enter 426
# endif
# ifndef __NR_io_uring_register
# define __NR_io_uring_register 427
# endif
#endif

/*
 * Barriers ordering accesses to ring head/tail values shared with the
 * kernel. x86 is strongly ordered, so a compiler barrier suffices there;
 * other architectures fall back to a full fence.
 */
#if defined(__x86_64) || defined(__i386__)
#define read_barrier() __asm__ __volatile__("":::"memory")
#define write_barrier() __asm__ __volatile__("":::"memory")
#else

#define read_barrier() __sync_synchronize()
#define write_barrier() __sync_synchronize()
#endif
0143
/*
 * Thin wrapper for the io_uring_setup(2) syscall: create an io_uring
 * instance with @entries entries and return its ring fd (or -1 with
 * errno set on failure).
 */
static int io_uring_setup(unsigned int entries, struct io_uring_params *p)
{
	long rc = syscall(__NR_io_uring_setup, entries, p);

	return (int) rc;
}
0148
0149 static int io_uring_enter(int fd, unsigned int to_submit,
0150 unsigned int min_complete,
0151 unsigned int flags, sigset_t *sig)
0152 {
0153 return syscall(__NR_io_uring_enter, fd, to_submit, min_complete,
0154 flags, sig, _NSIG / 8);
0155 }
0156
0157 static int io_uring_register_buffers(struct io_uring *ring,
0158 const struct iovec *iovecs,
0159 unsigned nr_iovecs)
0160 {
0161 int ret;
0162
0163 ret = syscall(__NR_io_uring_register, ring->ring_fd,
0164 IORING_REGISTER_BUFFERS, iovecs, nr_iovecs);
0165 return (ret < 0) ? -errno : ret;
0166 }
0167
/*
 * Map the three shared io_uring regions (SQ ring, SQE array, CQ ring)
 * and resolve the kernel-published offsets in @p into pointers.
 *
 * Returns 0 on success or -errno; on failure, regions mapped so far are
 * unmapped again via the goto-based unwind.
 */
static int io_uring_mmap(int fd, struct io_uring_params *p,
			 struct io_uring_sq *sq, struct io_uring_cq *cq)
{
	size_t size;
	void *ptr;
	int ret;

	/* SQ ring: head/tail/mask fields plus the index array, one mapping. */
	sq->ring_sz = p->sq_off.array + p->sq_entries * sizeof(unsigned);
	ptr = mmap(0, sq->ring_sz, PROT_READ | PROT_WRITE,
		   MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQ_RING);
	if (ptr == MAP_FAILED)
		return -errno;
	sq->khead = ptr + p->sq_off.head;
	sq->ktail = ptr + p->sq_off.tail;
	sq->kring_mask = ptr + p->sq_off.ring_mask;
	sq->kring_entries = ptr + p->sq_off.ring_entries;
	sq->kflags = ptr + p->sq_off.flags;
	sq->kdropped = ptr + p->sq_off.dropped;
	sq->array = ptr + p->sq_off.array;

	/* The SQE array lives in its own mapping. */
	size = p->sq_entries * sizeof(struct io_uring_sqe);
	sq->sqes = mmap(0, size, PROT_READ | PROT_WRITE,
			MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQES);
	if (sq->sqes == MAP_FAILED) {
		ret = -errno;
err:
		/* NOTE(review): unmaps via sq->khead (ptr + sq_off.head),
		 * which relies on the head offset being at the mapping
		 * base — confirm sq_off.head == 0 holds. */
		munmap(sq->khead, sq->ring_sz);
		return ret;
	}

	/* CQ ring: header fields plus the CQE array, one mapping. */
	cq->ring_sz = p->cq_off.cqes + p->cq_entries * sizeof(struct io_uring_cqe);
	ptr = mmap(0, cq->ring_sz, PROT_READ | PROT_WRITE,
		   MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_CQ_RING);
	if (ptr == MAP_FAILED) {
		ret = -errno;
		munmap(sq->sqes, p->sq_entries * sizeof(struct io_uring_sqe));
		goto err;
	}
	cq->khead = ptr + p->cq_off.head;
	cq->ktail = ptr + p->cq_off.tail;
	cq->kring_mask = ptr + p->cq_off.ring_mask;
	cq->kring_entries = ptr + p->cq_off.ring_entries;
	cq->koverflow = ptr + p->cq_off.overflow;
	cq->cqes = ptr + p->cq_off.cqes;
	return 0;
}
0214
0215 static int io_uring_queue_init(unsigned entries, struct io_uring *ring,
0216 unsigned flags)
0217 {
0218 struct io_uring_params p;
0219 int fd, ret;
0220
0221 memset(ring, 0, sizeof(*ring));
0222 memset(&p, 0, sizeof(p));
0223 p.flags = flags;
0224
0225 fd = io_uring_setup(entries, &p);
0226 if (fd < 0)
0227 return fd;
0228 ret = io_uring_mmap(fd, &p, &ring->sq, &ring->cq);
0229 if (!ret)
0230 ring->ring_fd = fd;
0231 else
0232 close(fd);
0233 return ret;
0234 }
0235
/*
 * Flush locally prepared SQEs into the shared SQ ring and notify the
 * kernel via io_uring_enter(2).
 *
 * Returns the number of SQEs submitted, 0 if there was nothing to do,
 * or -errno on failure.
 */
static int io_uring_submit(struct io_uring *ring)
{
	struct io_uring_sq *sq = &ring->sq;
	const unsigned mask = *sq->kring_mask;
	unsigned ktail, submitted, to_submit;
	int ret;

	read_barrier();
	/* Entries already sitting unconsumed in the shared ring: resubmit
	 * a full ring's worth without copying any new indices. */
	if (*sq->khead != *sq->ktail) {
		submitted = *sq->kring_entries;
		goto submit;
	}
	if (sq->sqe_head == sq->sqe_tail)
		return 0;	/* nothing prepared locally */

	/* Copy the locally reserved SQE indices into the shared array. */
	ktail = *sq->ktail;
	to_submit = sq->sqe_tail - sq->sqe_head;
	for (submitted = 0; submitted < to_submit; submitted++) {
		read_barrier();
		sq->array[ktail++ & mask] = sq->sqe_head++ & mask;
	}
	if (!submitted)
		return 0;

	/* Publish the new tail; the barriers order the array stores before
	 * the tail store becomes visible to the kernel. */
	if (*sq->ktail != ktail) {
		write_barrier();
		*sq->ktail = ktail;
		write_barrier();
	}
submit:
	ret = io_uring_enter(ring->ring_fd, submitted, 0,
			     IORING_ENTER_GETEVENTS, NULL);
	return ret < 0 ? -errno : ret;
}
0270
/*
 * Prepare @sqe as a plain IORING_OP_SEND request: send @len bytes from
 * @buf over @sockfd with the given MSG_* @flags. All other SQE fields
 * are cleared.
 */
static inline void io_uring_prep_send(struct io_uring_sqe *sqe, int sockfd,
				      const void *buf, size_t len, int flags)
{
	*sqe = (struct io_uring_sqe) {
		.opcode		= (__u8) IORING_OP_SEND,
		.fd		= sockfd,
		.addr		= (unsigned long) buf,
		.len		= len,
		.msg_flags	= (__u32) flags,
	};
}
0281
0282 static inline void io_uring_prep_sendzc(struct io_uring_sqe *sqe, int sockfd,
0283 const void *buf, size_t len, int flags,
0284 unsigned zc_flags)
0285 {
0286 io_uring_prep_send(sqe, sockfd, buf, len, flags);
0287 sqe->opcode = (__u8) IORING_OP_SEND_ZC;
0288 sqe->ioprio = zc_flags;
0289 }
0290
0291 static struct io_uring_sqe *io_uring_get_sqe(struct io_uring *ring)
0292 {
0293 struct io_uring_sq *sq = &ring->sq;
0294
0295 if (sq->sqe_tail + 1 - sq->sqe_head > *sq->kring_entries)
0296 return NULL;
0297 return &sq->sqes[sq->sqe_tail++ & *sq->kring_mask];
0298 }
0299
/*
 * Block until at least one completion is available and point *cqe_ptr at
 * it. The CQE is not consumed; the caller must call io_uring_cqe_seen()
 * after processing it. Returns 0 on success or -errno if
 * io_uring_enter(2) fails.
 */
static int io_uring_wait_cqe(struct io_uring *ring, struct io_uring_cqe **cqe_ptr)
{
	struct io_uring_cq *cq = &ring->cq;
	const unsigned mask = *cq->kring_mask;
	unsigned head = *cq->khead;
	int ret;

	*cqe_ptr = NULL;
	do {
		read_barrier();
		/* Kernel advanced the tail: a completion is ready. */
		if (head != *cq->ktail) {
			*cqe_ptr = &cq->cqes[head & mask];
			break;
		}
		/* Otherwise sleep in the kernel until one arrives. */
		ret = io_uring_enter(ring->ring_fd, 0, 1,
				     IORING_ENTER_GETEVENTS, NULL);
		if (ret < 0)
			return -errno;
	} while (1);

	return 0;
}
0322
0323 static inline void io_uring_cqe_seen(struct io_uring *ring)
0324 {
0325 *(&ring->cq)->khead += 1;
0326 write_barrier();
0327 }
0328
/*
 * Wall-clock time in milliseconds since the epoch.
 *
 * tv_sec is widened to 64 bits before scaling: on platforms with a
 * 32-bit time_t, tv.tv_sec * 1000 would overflow a signed 32-bit value
 * (undefined behavior) for any modern date.
 */
static unsigned long gettimeofday_ms(void)
{
	struct timeval tv;

	gettimeofday(&tv, NULL);
	return ((uint64_t) tv.tv_sec * 1000) + (tv.tv_usec / 1000);
}
0336
/*
 * setsockopt() wrapper for int-valued options; exits with a diagnostic
 * on failure.
 */
static void do_setsockopt(int fd, int level, int optname, int val)
{
	socklen_t optlen = sizeof(val);

	if (setsockopt(fd, level, optname, &val, optlen) == -1)
		error(1, errno, "setsockopt %d.%d: %d", level, optname, val);
}
0342
0343 static int do_setup_tx(int domain, int type, int protocol)
0344 {
0345 int fd;
0346
0347 fd = socket(domain, type, protocol);
0348 if (fd == -1)
0349 error(1, errno, "socket t");
0350
0351 do_setsockopt(fd, SOL_SOCKET, SO_SNDBUF, 1 << 21);
0352
0353 if (connect(fd, (void *) &cfg_dst_addr, cfg_alen))
0354 error(1, errno, "connect");
0355 return fd;
0356 }
0357
/*
 * Transmit loop: connect a socket, register the payload as fixed buffer
 * index 0, then for cfg_runtime_ms repeatedly submit batches of
 * cfg_nr_reqs send requests (plain, zerocopy, or fixed-buffer zerocopy
 * per cfg_mode) and validate the CQE flow of each. Prints throughput
 * stats on stderr when done. Exits on any protocol violation or error.
 */
static void do_tx(int domain, int type, int protocol)
{
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	unsigned long packets = 0, bytes = 0;
	struct io_uring ring;
	struct iovec iov;
	uint64_t tstop;
	int i, fd, ret;
	int compl_cqes = 0;	/* zerocopy notification CQEs still owed */

	fd = do_setup_tx(domain, type, protocol);

	ret = io_uring_queue_init(512, &ring, 0);
	if (ret)
		error(1, ret, "io_uring: queue init");

	/* Register payload[] as fixed buffer 0 for MODE_ZC_FIXED sends. */
	iov.iov_base = payload;
	iov.iov_len = cfg_payload_len;

	ret = io_uring_register_buffers(&ring, &iov, 1);
	if (ret)
		error(1, ret, "io_uring: buffer registration");

	tstop = gettimeofday_ms() + cfg_runtime_ms;
	do {
		if (cfg_cork)
			do_setsockopt(fd, IPPROTO_UDP, UDP_CORK, 1);

		/* Queue one batch of send SQEs. */
		for (i = 0; i < cfg_nr_reqs; i++) {
			unsigned zc_flags = 0;
			unsigned buf_idx = 0;
			unsigned mode = cfg_mode;
			unsigned msg_flags = MSG_WAITALL;

			if (cfg_mode == MODE_MIXED)
				mode = rand() % 3;

			sqe = io_uring_get_sqe(&ring);

			if (mode == MODE_NONZC) {
				io_uring_prep_send(sqe, fd, payload,
						   cfg_payload_len, msg_flags);
				sqe->user_data = NONZC_TAG;
			} else {
				/* Each zerocopy send owes one extra
				 * notification CQE, reaped later. */
				compl_cqes++;
				io_uring_prep_sendzc(sqe, fd, payload,
						     cfg_payload_len,
						     msg_flags, zc_flags);
				if (mode == MODE_ZC_FIXED) {
					sqe->ioprio |= IORING_RECVSEND_FIXED_BUF;
					sqe->buf_index = buf_idx;
				}
				sqe->user_data = ZC_TAG;
			}
		}

		ret = io_uring_submit(&ring);
		if (ret != cfg_nr_reqs)
			error(1, ret, "submit");

		if (cfg_cork)
			do_setsockopt(fd, IPPROTO_UDP, UDP_CORK, 0);
		/* Reap one completion per request; notification CQEs do
		 * not count against the batch, hence the i--. */
		for (i = 0; i < cfg_nr_reqs; i++) {
			ret = io_uring_wait_cqe(&ring, &cqe);
			if (ret)
				error(1, ret, "wait cqe");

			if (cqe->user_data != NONZC_TAG &&
			    cqe->user_data != ZC_TAG)
				error(1, -EINVAL, "invalid cqe->user_data");

			if (cqe->flags & IORING_CQE_F_NOTIF) {
				/* Zerocopy buffer-release notification:
				 * must be the final CQE (no F_MORE). */
				if (cqe->flags & IORING_CQE_F_MORE)
					error(1, -EINVAL, "invalid notif flags");
				compl_cqes--;
				i--;
			} else if (cqe->res <= 0) {
				if (cqe->flags & IORING_CQE_F_MORE)
					error(1, cqe->res, "more with a failed send");
				error(1, cqe->res, "send failed");
			} else {
				/* A successful zerocopy send must promise
				 * its notification via F_MORE. */
				if (cqe->user_data == ZC_TAG &&
				    !(cqe->flags & IORING_CQE_F_MORE))
					error(1, cqe->res, "missing more flag");
				packets++;
				bytes += cqe->res;
			}
			io_uring_cqe_seen(&ring);
		}
	} while (gettimeofday_ms() < tstop);

	/* Drain the outstanding zerocopy notifications. */
	while (compl_cqes) {
		ret = io_uring_wait_cqe(&ring, &cqe);
		if (ret)
			error(1, ret, "wait cqe");
		if (cqe->flags & IORING_CQE_F_MORE)
			error(1, -EINVAL, "invalid notif flags");
		if (!(cqe->flags & IORING_CQE_F_NOTIF))
			error(1, -EINVAL, "missing notif flag");

		io_uring_cqe_seen(&ring);
		compl_cqes--;
	}

	fprintf(stderr, "tx=%lu (MB=%lu), tx/s=%lu (MB/s=%lu)\n",
		packets, bytes >> 20,
		packets / (cfg_runtime_ms / 1000),
		(bytes >> 20) / (cfg_runtime_ms / 1000));

	if (close(fd))
		error(1, errno, "close");
}
0471
0472 static void do_test(int domain, int type, int protocol)
0473 {
0474 int i;
0475
0476 for (i = 0; i < IP_MAXPACKET; i++)
0477 payload[i] = 'a' + (i % 26);
0478 do_tx(domain, type, protocol);
0479 }
0480
/* Print the usage line to stderr and exit(1). */
static void usage(const char *filepath)
{
	error(1, 0, "Usage: %s (-4|-6) (udp|tcp) -D<dst_ip> [-s<payload size>] "
		    "[-t<time s>] [-n<batch>] [-p<port>] [-m<mode>]", filepath);
}
0486
/*
 * Parse command line options into the cfg_* globals and build the
 * destination sockaddr. Exits via usage()/error() on invalid input.
 * Exactly one positional argument (the protocol name) must remain.
 */
static void parse_opts(int argc, char **argv)
{
	/* Largest -s the payload[] buffer can carry once IP/TCP headers
	 * (plus 40 spare bytes — presumably for options/extension
	 * headers, TODO confirm) are accounted for. */
	const int max_payload_len = sizeof(payload) -
				    sizeof(struct ipv6hdr) -
				    sizeof(struct tcphdr) -
				    40;
	struct sockaddr_in6 *addr6 = (void *) &cfg_dst_addr;
	struct sockaddr_in *addr4 = (void *) &cfg_dst_addr;
	char *daddr = NULL;
	int c;

	if (argc <= 1)
		usage(argv[0]);
	cfg_payload_len = max_payload_len;	/* default: maximum */

	while ((c = getopt(argc, argv, "46D:p:s:t:n:c:m:")) != -1) {
		switch (c) {
		case '4':
			if (cfg_family != PF_UNSPEC)
				error(1, 0, "Pass one of -4 or -6");
			cfg_family = PF_INET;
			cfg_alen = sizeof(struct sockaddr_in);
			break;
		case '6':
			if (cfg_family != PF_UNSPEC)
				error(1, 0, "Pass one of -4 or -6");
			cfg_family = PF_INET6;
			cfg_alen = sizeof(struct sockaddr_in6);
			break;
		case 'D':
			daddr = optarg;
			break;
		case 'p':
			cfg_port = strtoul(optarg, NULL, 0);
			break;
		case 's':
			cfg_payload_len = strtoul(optarg, NULL, 0);
			break;
		case 't':
			/* 200ms of slack on top of the requested seconds. */
			cfg_runtime_ms = 200 + strtoul(optarg, NULL, 10) * 1000;
			break;
		case 'n':
			cfg_nr_reqs = strtoul(optarg, NULL, 0);
			break;
		case 'c':
			cfg_cork = strtol(optarg, NULL, 0);
			break;
		case 'm':
			cfg_mode = strtol(optarg, NULL, 0);
			break;
		}
	}

	/* Build the destination sockaddr for the selected family. */
	switch (cfg_family) {
	case PF_INET:
		memset(addr4, 0, sizeof(*addr4));
		addr4->sin_family = AF_INET;
		addr4->sin_port = htons(cfg_port);
		if (daddr &&
		    inet_pton(AF_INET, daddr, &(addr4->sin_addr)) != 1)
			error(1, 0, "ipv4 parse error: %s", daddr);
		break;
	case PF_INET6:
		memset(addr6, 0, sizeof(*addr6));
		addr6->sin6_family = AF_INET6;
		addr6->sin6_port = htons(cfg_port);
		if (daddr &&
		    inet_pton(AF_INET6, daddr, &(addr6->sin6_addr)) != 1)
			error(1, 0, "ipv6 parse error: %s", daddr);
		break;
	default:
		error(1, 0, "illegal domain");
	}

	if (cfg_payload_len > max_payload_len)
		error(1, 0, "-s: payload exceeds max (%d)", max_payload_len);
	if (optind != argc - 1)
		usage(argv[0]);	/* exactly one positional arg expected */
}
0566
0567 int main(int argc, char **argv)
0568 {
0569 const char *cfg_test = argv[argc - 1];
0570
0571 parse_opts(argc, argv);
0572
0573 if (!strcmp(cfg_test, "tcp"))
0574 do_test(cfg_family, SOCK_STREAM, 0);
0575 else if (!strcmp(cfg_test, "udp"))
0576 do_test(cfg_family, SOCK_DGRAM, 0);
0577 else
0578 error(1, 0, "unknown cfg_test %s", cfg_test);
0579 return 0;
0580 }