Back to home page

OSCL-LXR

 
 

    


0001 /* SPDX-License-Identifier: MIT */
0002 /* based on linux-kernel/tools/testing/selftests/net/msg_zerocopy.c */
0003 #include <assert.h>
0004 #include <errno.h>
0005 #include <error.h>
0006 #include <fcntl.h>
0007 #include <limits.h>
0008 #include <stdbool.h>
0009 #include <stdint.h>
0010 #include <stdio.h>
0011 #include <stdlib.h>
0012 #include <string.h>
0013 #include <unistd.h>
0014 
0015 #include <arpa/inet.h>
0016 #include <linux/errqueue.h>
0017 #include <linux/if_packet.h>
0018 #include <linux/io_uring.h>
0019 #include <linux/ipv6.h>
0020 #include <linux/socket.h>
0021 #include <linux/sockios.h>
0022 #include <net/ethernet.h>
0023 #include <net/if.h>
0024 #include <netinet/in.h>
0025 #include <netinet/ip.h>
0026 #include <netinet/ip6.h>
0027 #include <netinet/tcp.h>
0028 #include <netinet/udp.h>
0029 #include <sys/ioctl.h>
0030 #include <sys/mman.h>
0031 #include <sys/resource.h>
0032 #include <sys/socket.h>
0033 #include <sys/stat.h>
0034 #include <sys/time.h>
0035 #include <sys/types.h>
0036 #include <sys/un.h>
0037 #include <sys/wait.h>
0038 
/*
 * user_data tags stamped on submitted requests so that a completion can
 * be matched to the kind of send that produced it.
 */
#define NOTIF_TAG   0xFFFFFFFULL    /* not referenced by the code visible here */
#define NONZC_TAG   0               /* request prepared as IORING_OP_SEND */
#define ZC_TAG      1               /* request prepared as IORING_OP_SEND_ZC */
0042 
/* Send strategies selectable through cfg_mode (the -m option). */
enum {
    MODE_NONZC = 0,     /* plain IORING_OP_SEND */
    MODE_ZC,            /* IORING_OP_SEND_ZC from a normal buffer */
    MODE_ZC_FIXED,      /* IORING_OP_SEND_ZC from the registered buffer */
    MODE_MIXED,         /* choose one of the three above at random per request */
};
0049 
0050 static bool cfg_cork        = false;
0051 static int  cfg_mode        = MODE_ZC_FIXED;
0052 static int  cfg_nr_reqs     = 8;
0053 static int  cfg_family      = PF_UNSPEC;
0054 static int  cfg_payload_len;
0055 static int  cfg_port        = 8000;
0056 static int  cfg_runtime_ms  = 4200;
0057 
0058 static socklen_t cfg_alen;
0059 static struct sockaddr_storage cfg_dst_addr;
0060 
0061 static char payload[IP_MAXPACKET] __attribute__((aligned(4096)));
0062 
/*
 * Bundle of pointers describing a submission ring.
 * Not referenced by the code visible in this file.
 */
struct io_sq_ring {
    unsigned int *head;
    unsigned int *tail;
    unsigned int *ring_mask;
    unsigned int *ring_entries;
    unsigned int *flags;
    unsigned int *array;
};
0071 
/*
 * Bundle of pointers describing a completion ring.
 * Not referenced by the code visible in this file.
 */
struct io_cq_ring {
    unsigned int *head;
    unsigned int *tail;
    unsigned int *ring_mask;
    unsigned int *ring_entries;
    struct io_uring_cqe *cqes;
};
0079 
/*
 * Submission queue state.  The k* pointers and array are set by
 * io_uring_mmap() to point into the mapped SQ ring; sqes points at the
 * separately mapped SQE array.
 */
struct io_uring_sq {
    unsigned int *khead;
    unsigned int *ktail;
    unsigned int *kring_mask;
    unsigned int *kring_entries;
    unsigned int *kflags;
    unsigned int *kdropped;
    unsigned int *array;
    struct io_uring_sqe *sqes;

    /* Local cursors: SQEs handed out (tail) vs. already published (head). */
    unsigned int sqe_head;
    unsigned int sqe_tail;

    /* Length in bytes of the SQ ring mapping. */
    size_t ring_sz;
};
0095 
/*
 * Completion queue state.  All pointers are set by io_uring_mmap() to
 * point into the mapped CQ ring.
 */
struct io_uring_cq {
    unsigned int *khead;
    unsigned int *ktail;
    unsigned int *kring_mask;
    unsigned int *kring_entries;
    unsigned int *koverflow;
    struct io_uring_cqe *cqes;

    /* Length in bytes of the CQ ring mapping. */
    size_t ring_sz;
};
0106 
0107 struct io_uring {
0108     struct io_uring_sq sq;
0109     struct io_uring_cq cq;
0110     int ring_fd;
0111 };
0112 
/*
 * Fallback io_uring syscall numbers, used only when the system headers
 * did not already define them.  Alpha has its own numbering.
 */
#ifndef __NR_io_uring_setup
# ifdef __alpha__
#  define __NR_io_uring_setup       535
# else
#  define __NR_io_uring_setup       425
# endif
#endif

#ifndef __NR_io_uring_enter
# ifdef __alpha__
#  define __NR_io_uring_enter       536
# else
#  define __NR_io_uring_enter       426
# endif
#endif

#ifndef __NR_io_uring_register
# ifdef __alpha__
#  define __NR_io_uring_register    537
# else
#  define __NR_io_uring_register    427
# endif
#endif
0134 
/*
 * Barriers used around accesses to the shared ring indices.
 * On x86 a compiler-only barrier is used; every other architecture
 * issues a full __sync_synchronize().
 */
#if !defined(__x86_64) && !defined(__i386__)

#define read_barrier()  __sync_synchronize()
#define write_barrier() __sync_synchronize()
#else
#define read_barrier()  __asm__ __volatile__("":::"memory")
#define write_barrier() __asm__ __volatile__("":::"memory")
#endif
0143 
0144 static int io_uring_setup(unsigned int entries, struct io_uring_params *p)
0145 {
0146     return syscall(__NR_io_uring_setup, entries, p);
0147 }
0148 
0149 static int io_uring_enter(int fd, unsigned int to_submit,
0150               unsigned int min_complete,
0151               unsigned int flags, sigset_t *sig)
0152 {
0153     return syscall(__NR_io_uring_enter, fd, to_submit, min_complete,
0154             flags, sig, _NSIG / 8);
0155 }
0156 
0157 static int io_uring_register_buffers(struct io_uring *ring,
0158                      const struct iovec *iovecs,
0159                      unsigned nr_iovecs)
0160 {
0161     int ret;
0162 
0163     ret = syscall(__NR_io_uring_register, ring->ring_fd,
0164               IORING_REGISTER_BUFFERS, iovecs, nr_iovecs);
0165     return (ret < 0) ? -errno : ret;
0166 }
0167 
0168 static int io_uring_mmap(int fd, struct io_uring_params *p,
0169              struct io_uring_sq *sq, struct io_uring_cq *cq)
0170 {
0171     size_t size;
0172     void *ptr;
0173     int ret;
0174 
0175     sq->ring_sz = p->sq_off.array + p->sq_entries * sizeof(unsigned);
0176     ptr = mmap(0, sq->ring_sz, PROT_READ | PROT_WRITE,
0177            MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQ_RING);
0178     if (ptr == MAP_FAILED)
0179         return -errno;
0180     sq->khead = ptr + p->sq_off.head;
0181     sq->ktail = ptr + p->sq_off.tail;
0182     sq->kring_mask = ptr + p->sq_off.ring_mask;
0183     sq->kring_entries = ptr + p->sq_off.ring_entries;
0184     sq->kflags = ptr + p->sq_off.flags;
0185     sq->kdropped = ptr + p->sq_off.dropped;
0186     sq->array = ptr + p->sq_off.array;
0187 
0188     size = p->sq_entries * sizeof(struct io_uring_sqe);
0189     sq->sqes = mmap(0, size, PROT_READ | PROT_WRITE,
0190             MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQES);
0191     if (sq->sqes == MAP_FAILED) {
0192         ret = -errno;
0193 err:
0194         munmap(sq->khead, sq->ring_sz);
0195         return ret;
0196     }
0197 
0198     cq->ring_sz = p->cq_off.cqes + p->cq_entries * sizeof(struct io_uring_cqe);
0199     ptr = mmap(0, cq->ring_sz, PROT_READ | PROT_WRITE,
0200             MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_CQ_RING);
0201     if (ptr == MAP_FAILED) {
0202         ret = -errno;
0203         munmap(sq->sqes, p->sq_entries * sizeof(struct io_uring_sqe));
0204         goto err;
0205     }
0206     cq->khead = ptr + p->cq_off.head;
0207     cq->ktail = ptr + p->cq_off.tail;
0208     cq->kring_mask = ptr + p->cq_off.ring_mask;
0209     cq->kring_entries = ptr + p->cq_off.ring_entries;
0210     cq->koverflow = ptr + p->cq_off.overflow;
0211     cq->cqes = ptr + p->cq_off.cqes;
0212     return 0;
0213 }
0214 
0215 static int io_uring_queue_init(unsigned entries, struct io_uring *ring,
0216                    unsigned flags)
0217 {
0218     struct io_uring_params p;
0219     int fd, ret;
0220 
0221     memset(ring, 0, sizeof(*ring));
0222     memset(&p, 0, sizeof(p));
0223     p.flags = flags;
0224 
0225     fd = io_uring_setup(entries, &p);
0226     if (fd < 0)
0227         return fd;
0228     ret = io_uring_mmap(fd, &p, &ring->sq, &ring->cq);
0229     if (!ret)
0230         ring->ring_fd = fd;
0231     else
0232         close(fd);
0233     return ret;
0234 }
0235 
0236 static int io_uring_submit(struct io_uring *ring)
0237 {
0238     struct io_uring_sq *sq = &ring->sq;
0239     const unsigned mask = *sq->kring_mask;
0240     unsigned ktail, submitted, to_submit;
0241     int ret;
0242 
0243     read_barrier();
0244     if (*sq->khead != *sq->ktail) {
0245         submitted = *sq->kring_entries;
0246         goto submit;
0247     }
0248     if (sq->sqe_head == sq->sqe_tail)
0249         return 0;
0250 
0251     ktail = *sq->ktail;
0252     to_submit = sq->sqe_tail - sq->sqe_head;
0253     for (submitted = 0; submitted < to_submit; submitted++) {
0254         read_barrier();
0255         sq->array[ktail++ & mask] = sq->sqe_head++ & mask;
0256     }
0257     if (!submitted)
0258         return 0;
0259 
0260     if (*sq->ktail != ktail) {
0261         write_barrier();
0262         *sq->ktail = ktail;
0263         write_barrier();
0264     }
0265 submit:
0266     ret = io_uring_enter(ring->ring_fd, submitted, 0,
0267                 IORING_ENTER_GETEVENTS, NULL);
0268     return ret < 0 ? -errno : ret;
0269 }
0270 
0271 static inline void io_uring_prep_send(struct io_uring_sqe *sqe, int sockfd,
0272                       const void *buf, size_t len, int flags)
0273 {
0274     memset(sqe, 0, sizeof(*sqe));
0275     sqe->opcode = (__u8) IORING_OP_SEND;
0276     sqe->fd = sockfd;
0277     sqe->addr = (unsigned long) buf;
0278     sqe->len = len;
0279     sqe->msg_flags = (__u32) flags;
0280 }
0281 
0282 static inline void io_uring_prep_sendzc(struct io_uring_sqe *sqe, int sockfd,
0283                         const void *buf, size_t len, int flags,
0284                         unsigned zc_flags)
0285 {
0286     io_uring_prep_send(sqe, sockfd, buf, len, flags);
0287     sqe->opcode = (__u8) IORING_OP_SEND_ZC;
0288     sqe->ioprio = zc_flags;
0289 }
0290 
0291 static struct io_uring_sqe *io_uring_get_sqe(struct io_uring *ring)
0292 {
0293     struct io_uring_sq *sq = &ring->sq;
0294 
0295     if (sq->sqe_tail + 1 - sq->sqe_head > *sq->kring_entries)
0296         return NULL;
0297     return &sq->sqes[sq->sqe_tail++ & *sq->kring_mask];
0298 }
0299 
0300 static int io_uring_wait_cqe(struct io_uring *ring, struct io_uring_cqe **cqe_ptr)
0301 {
0302     struct io_uring_cq *cq = &ring->cq;
0303     const unsigned mask = *cq->kring_mask;
0304     unsigned head = *cq->khead;
0305     int ret;
0306 
0307     *cqe_ptr = NULL;
0308     do {
0309         read_barrier();
0310         if (head != *cq->ktail) {
0311             *cqe_ptr = &cq->cqes[head & mask];
0312             break;
0313         }
0314         ret = io_uring_enter(ring->ring_fd, 0, 1,
0315                     IORING_ENTER_GETEVENTS, NULL);
0316         if (ret < 0)
0317             return -errno;
0318     } while (1);
0319 
0320     return 0;
0321 }
0322 
0323 static inline void io_uring_cqe_seen(struct io_uring *ring)
0324 {
0325     *(&ring->cq)->khead += 1;
0326     write_barrier();
0327 }
0328 
/*
 * Current wall-clock time in milliseconds, derived from gettimeofday().
 *
 * The arithmetic and the return value are 64 bits wide, so the result
 * does not overflow or get truncated where long is 32 bits.
 */
static uint64_t gettimeofday_ms(void)
{
    struct timeval tv;

    gettimeofday(&tv, NULL);
    return (uint64_t) tv.tv_sec * 1000 + (uint64_t) tv.tv_usec / 1000;
}
0336 
/*
 * Set an integer socket option on fd.  Exits the program through
 * error() if setsockopt() fails.
 */
static void do_setsockopt(int fd, int level, int optname, int val)
{
    int rc = setsockopt(fd, level, optname, &val, sizeof(val));

    if (rc != 0)
        error(1, errno, "setsockopt %d.%d: %d", level, optname, val);
}
0342 
0343 static int do_setup_tx(int domain, int type, int protocol)
0344 {
0345     int fd;
0346 
0347     fd = socket(domain, type, protocol);
0348     if (fd == -1)
0349         error(1, errno, "socket t");
0350 
0351     do_setsockopt(fd, SOL_SOCKET, SO_SNDBUF, 1 << 21);
0352 
0353     if (connect(fd, (void *) &cfg_dst_addr, cfg_alen))
0354         error(1, errno, "connect");
0355     return fd;
0356 }
0357 
0358 static void do_tx(int domain, int type, int protocol)
0359 {
0360     struct io_uring_sqe *sqe;
0361     struct io_uring_cqe *cqe;
0362     unsigned long packets = 0, bytes = 0;
0363     struct io_uring ring;
0364     struct iovec iov;
0365     uint64_t tstop;
0366     int i, fd, ret;
0367     int compl_cqes = 0;
0368 
0369     fd = do_setup_tx(domain, type, protocol);
0370 
0371     ret = io_uring_queue_init(512, &ring, 0);
0372     if (ret)
0373         error(1, ret, "io_uring: queue init");
0374 
0375     iov.iov_base = payload;
0376     iov.iov_len = cfg_payload_len;
0377 
0378     ret = io_uring_register_buffers(&ring, &iov, 1);
0379     if (ret)
0380         error(1, ret, "io_uring: buffer registration");
0381 
0382     tstop = gettimeofday_ms() + cfg_runtime_ms;
0383     do {
0384         if (cfg_cork)
0385             do_setsockopt(fd, IPPROTO_UDP, UDP_CORK, 1);
0386 
0387         for (i = 0; i < cfg_nr_reqs; i++) {
0388             unsigned zc_flags = 0;
0389             unsigned buf_idx = 0;
0390             unsigned mode = cfg_mode;
0391             unsigned msg_flags = MSG_WAITALL;
0392 
0393             if (cfg_mode == MODE_MIXED)
0394                 mode = rand() % 3;
0395 
0396             sqe = io_uring_get_sqe(&ring);
0397 
0398             if (mode == MODE_NONZC) {
0399                 io_uring_prep_send(sqe, fd, payload,
0400                            cfg_payload_len, msg_flags);
0401                 sqe->user_data = NONZC_TAG;
0402             } else {
0403                 compl_cqes++;
0404                 io_uring_prep_sendzc(sqe, fd, payload,
0405                              cfg_payload_len,
0406                              msg_flags, zc_flags);
0407                 if (mode == MODE_ZC_FIXED) {
0408                     sqe->ioprio |= IORING_RECVSEND_FIXED_BUF;
0409                     sqe->buf_index = buf_idx;
0410                 }
0411                 sqe->user_data = ZC_TAG;
0412             }
0413         }
0414 
0415         ret = io_uring_submit(&ring);
0416         if (ret != cfg_nr_reqs)
0417             error(1, ret, "submit");
0418 
0419         if (cfg_cork)
0420             do_setsockopt(fd, IPPROTO_UDP, UDP_CORK, 0);
0421         for (i = 0; i < cfg_nr_reqs; i++) {
0422             ret = io_uring_wait_cqe(&ring, &cqe);
0423             if (ret)
0424                 error(1, ret, "wait cqe");
0425 
0426             if (cqe->user_data != NONZC_TAG &&
0427                 cqe->user_data != ZC_TAG)
0428                 error(1, -EINVAL, "invalid cqe->user_data");
0429 
0430             if (cqe->flags & IORING_CQE_F_NOTIF) {
0431                 if (cqe->flags & IORING_CQE_F_MORE)
0432                     error(1, -EINVAL, "invalid notif flags");
0433                 compl_cqes--;
0434                 i--;
0435             } else if (cqe->res <= 0) {
0436                 if (cqe->flags & IORING_CQE_F_MORE)
0437                     error(1, cqe->res, "more with a failed send");
0438                 error(1, cqe->res, "send failed");
0439             } else {
0440                 if (cqe->user_data == ZC_TAG &&
0441                     !(cqe->flags & IORING_CQE_F_MORE))
0442                     error(1, cqe->res, "missing more flag");
0443                 packets++;
0444                 bytes += cqe->res;
0445             }
0446             io_uring_cqe_seen(&ring);
0447         }
0448     } while (gettimeofday_ms() < tstop);
0449 
0450     while (compl_cqes) {
0451         ret = io_uring_wait_cqe(&ring, &cqe);
0452         if (ret)
0453             error(1, ret, "wait cqe");
0454         if (cqe->flags & IORING_CQE_F_MORE)
0455             error(1, -EINVAL, "invalid notif flags");
0456         if (!(cqe->flags & IORING_CQE_F_NOTIF))
0457             error(1, -EINVAL, "missing notif flag");
0458 
0459         io_uring_cqe_seen(&ring);
0460         compl_cqes--;
0461     }
0462 
0463     fprintf(stderr, "tx=%lu (MB=%lu), tx/s=%lu (MB/s=%lu)\n",
0464             packets, bytes >> 20,
0465             packets / (cfg_runtime_ms / 1000),
0466             (bytes >> 20) / (cfg_runtime_ms / 1000));
0467 
0468     if (close(fd))
0469         error(1, errno, "close");
0470 }
0471 
0472 static void do_test(int domain, int type, int protocol)
0473 {
0474     int i;
0475 
0476     for (i = 0; i < IP_MAXPACKET; i++)
0477         payload[i] = 'a' + (i % 26);
0478     do_tx(domain, type, protocol);
0479 }
0480 
/*
 * Print the command-line synopsis and exit with status 1.
 * Every option accepted by parse_opts() is listed, including -c.
 */
static void usage(const char *filepath)
{
    error(1, 0, "Usage: %s (-4|-6) (udp|tcp) -D<dst_ip> [-s<payload size>] "
            "[-t<time s>] [-n<batch>] [-p<port>] [-c<cork>] [-m<mode>]",
            filepath);
}
0486 
0487 static void parse_opts(int argc, char **argv)
0488 {
0489     const int max_payload_len = sizeof(payload) -
0490                     sizeof(struct ipv6hdr) -
0491                     sizeof(struct tcphdr) -
0492                     40 /* max tcp options */;
0493     struct sockaddr_in6 *addr6 = (void *) &cfg_dst_addr;
0494     struct sockaddr_in *addr4 = (void *) &cfg_dst_addr;
0495     char *daddr = NULL;
0496     int c;
0497 
0498     if (argc <= 1)
0499         usage(argv[0]);
0500     cfg_payload_len = max_payload_len;
0501 
0502     while ((c = getopt(argc, argv, "46D:p:s:t:n:c:m:")) != -1) {
0503         switch (c) {
0504         case '4':
0505             if (cfg_family != PF_UNSPEC)
0506                 error(1, 0, "Pass one of -4 or -6");
0507             cfg_family = PF_INET;
0508             cfg_alen = sizeof(struct sockaddr_in);
0509             break;
0510         case '6':
0511             if (cfg_family != PF_UNSPEC)
0512                 error(1, 0, "Pass one of -4 or -6");
0513             cfg_family = PF_INET6;
0514             cfg_alen = sizeof(struct sockaddr_in6);
0515             break;
0516         case 'D':
0517             daddr = optarg;
0518             break;
0519         case 'p':
0520             cfg_port = strtoul(optarg, NULL, 0);
0521             break;
0522         case 's':
0523             cfg_payload_len = strtoul(optarg, NULL, 0);
0524             break;
0525         case 't':
0526             cfg_runtime_ms = 200 + strtoul(optarg, NULL, 10) * 1000;
0527             break;
0528         case 'n':
0529             cfg_nr_reqs = strtoul(optarg, NULL, 0);
0530             break;
0531         case 'c':
0532             cfg_cork = strtol(optarg, NULL, 0);
0533             break;
0534         case 'm':
0535             cfg_mode = strtol(optarg, NULL, 0);
0536             break;
0537         }
0538     }
0539 
0540     switch (cfg_family) {
0541     case PF_INET:
0542         memset(addr4, 0, sizeof(*addr4));
0543         addr4->sin_family = AF_INET;
0544         addr4->sin_port = htons(cfg_port);
0545         if (daddr &&
0546             inet_pton(AF_INET, daddr, &(addr4->sin_addr)) != 1)
0547             error(1, 0, "ipv4 parse error: %s", daddr);
0548         break;
0549     case PF_INET6:
0550         memset(addr6, 0, sizeof(*addr6));
0551         addr6->sin6_family = AF_INET6;
0552         addr6->sin6_port = htons(cfg_port);
0553         if (daddr &&
0554             inet_pton(AF_INET6, daddr, &(addr6->sin6_addr)) != 1)
0555             error(1, 0, "ipv6 parse error: %s", daddr);
0556         break;
0557     default:
0558         error(1, 0, "illegal domain");
0559     }
0560 
0561     if (cfg_payload_len > max_payload_len)
0562         error(1, 0, "-s: payload exceeds max (%d)", max_payload_len);
0563     if (optind != argc - 1)
0564         usage(argv[0]);
0565 }
0566 
0567 int main(int argc, char **argv)
0568 {
0569     const char *cfg_test = argv[argc - 1];
0570 
0571     parse_opts(argc, argv);
0572 
0573     if (!strcmp(cfg_test, "tcp"))
0574         do_test(cfg_family, SOCK_STREAM, 0);
0575     else if (!strcmp(cfg_test, "udp"))
0576         do_test(cfg_family, SOCK_DGRAM, 0);
0577     else
0578         error(1, 0, "unknown cfg_test %s", cfg_test);
0579     return 0;
0580 }