// SPDX-License-Identifier: GPL-2.0
#ifdef HAVE_EVENTFD_SUPPORT
/*
 * Copyright (C) 2018 Davidlohr Bueso.
 *
 * This program benchmarks concurrent epoll_wait(2) monitoring multiple
 * file descriptors under one of two load balancing models. The first,
 * and default, is the single/combined queueing (which refers to a single
 * epoll instance for N worker threads):
 *
 *                          |---> [worker A]
 *                          |---> [worker B]
 *        [combined queue]  .---> [worker C]
 *                          |---> [worker D]
 *                          |---> [worker E]
 *
 * The second model, enabled via the --multiq option, uses multiple
 * queueing (which refers to one epoll instance per worker). For example,
 * short-lived tcp connections in a high-throughput httpd server will
 * distribute the accept()'ing connections across CPUs. In this case each
 * worker does a limited amount of processing.
 *
 *             [queue A]  ---> [worker]
 *             [queue B]  ---> [worker]
 *             [queue C]  ---> [worker]
 *             [queue D]  ---> [worker]
 *             [queue E]  ---> [worker]
 *
 * Naturally, the single queue will enforce more concurrency on the epoll
 * instance, and can therefore scale poorly compared to multiple queues.
 * However, this benchmark outputs raw data and must be taken with a grain
 * of salt when choosing how to make use of sys_epoll.
 *
 * Each thread has a number of private, nonblocking file descriptors,
 * referred to as fdmap. A writer thread will constantly be writing to
 * the fdmaps of all threads, minimizing each thread's chances of
 * epoll_wait(2) not finding any ready read events and blocking, as this
 * is not what we want to stress. The size of the fdmap can be adjusted
 * by the user; enlarging the value will increase the chances of
 * epoll_wait(2) blocking, as the linear writer thread will take "longer"
 * to get back to a given fd, at least at a high level.
 *
 * Note that because fds are private to each thread, this workload does
 * not stress scenarios where multiple tasks are awoken per ready IO, i.e.
 * EPOLLEXCLUSIVE semantics.
 *
 * The end result/metric is throughput: the number of ops/second, where an
 * operation consists of:
 *
 *   epoll_wait(2) + [others]
 *
 *        ... where [others] is the cost of re-adding the fd (EPOLLET),
 *            or rearming it (EPOLLONESHOT).
 *
 * This program is meant to be useful for measuring kernel-related changes
 * to sys_epoll, not for comparing different IO polling methods. Hence
 * everything is very ad hoc and outputs raw microbenchmark numbers. Also,
 * this uses eventfd; similar tools tend to use pipes or sockets, but the
 * result is the same.
 */
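
/*
 * Illustrative invocations (options as defined in the table below; the
 * numbers are arbitrary and only meant as a sketch):
 *
 *   perf bench epoll wait                 # combined queue, defaults
 *   perf bench epoll wait -t 4 -f 128     # 4 workers, 128 fds each
 *   perf bench epoll wait -m -E           # one epoll instance per worker,
 *                                         #   edge-triggered
 */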

/* For the CLR_() macros */
#include <string.h>
#include <pthread.h>
#include <unistd.h>

#include <errno.h>
#include <inttypes.h>
#include <signal.h>
#include <stdlib.h>
#include <linux/compiler.h>
#include <linux/kernel.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <sys/epoll.h>
#include <sys/eventfd.h>
#include <sys/types.h>
#include <perf/cpumap.h>

#include "../util/stat.h"
#include <subcmd/parse-options.h>
#include "bench.h"

#include <err.h>

#define printinfo(fmt, arg...) \
    do { if (__verbose) { printf(fmt, ## arg); fflush(stdout); } } while (0)

static unsigned int nthreads = 0;
static unsigned int nsecs    = 8;
static bool wdone, done, __verbose, randomize, nonblocking;

/*
 * epoll related shared variables.
 */

/* Maximum number of nesting allowed inside epoll sets */
#define EPOLL_MAXNESTS 4

static int epollfd;
static int *epollfdp;
static bool noaffinity;
static unsigned int nested = 0;
static bool et; /* edge-triggered */
static bool oneshot;
static bool multiq; /* use an epoll instance per thread */

/* number of fds to monitor, per thread */
static unsigned int nfds = 64;

static pthread_mutex_t thread_lock;
static unsigned int threads_starting;
static struct stats throughput_stats;
static pthread_cond_t thread_parent, thread_worker;

struct worker {
    int tid;
    int epollfd; /* for --multiq */
    pthread_t thread;
    unsigned long ops;
    int *fdmap;
};

static const struct option options[] = {
    /* general benchmark options */
    OPT_UINTEGER('t', "threads", &nthreads, "Specify number of threads"),
    OPT_UINTEGER('r', "runtime", &nsecs, "Specify runtime (in seconds)"),
    OPT_UINTEGER('f', "nfds", &nfds, "Specify number of file descriptors to monitor for each thread"),
    OPT_BOOLEAN('n', "noaffinity", &noaffinity, "Disable CPU affinity"),
    OPT_BOOLEAN('R', "randomize", &randomize, "Enable random write behaviour (default is linear)"),
    OPT_BOOLEAN('v', "verbose", &__verbose, "Verbose mode"),

    /* epoll specific options */
    OPT_BOOLEAN('m', "multiq", &multiq, "Use multiple epoll instances (one per thread)"),
    OPT_BOOLEAN('B', "nonblocking", &nonblocking, "Nonblocking epoll_wait(2) behaviour"),
    OPT_UINTEGER('N', "nested", &nested, "Nesting level of the epoll hierarchy (default is 0, no nesting)"),
    OPT_BOOLEAN('S', "oneshot", &oneshot, "Use EPOLLONESHOT semantics"),
    OPT_BOOLEAN('E', "edge", &et, "Use edge-triggered interface (default is LT)"),

    OPT_END()
};

static const char * const bench_epoll_wait_usage[] = {
    "perf bench epoll wait <options>",
    NULL
};

/*
 * Arrange the N elements of ARRAY in random order.
 * Only effective if N is much smaller than RAND_MAX;
 * if this may not be the case, use a better random
 * number generator. -- Ben Pfaff.
 */
static void shuffle(void *array, size_t n, size_t size)
{
    char *carray = array;
    void *aux;
    size_t i;

    if (n <= 1)
        return;

    aux = calloc(1, size);
    if (!aux)
        err(EXIT_FAILURE, "calloc");

    for (i = 1; i < n; ++i) {
        size_t j = i + rand() / (RAND_MAX / (n - i) + 1);
        j *= size;

        memcpy(aux, &carray[j], size);
        memcpy(&carray[j], &carray[i*size], size);
        memcpy(&carray[i*size], aux, size);
    }

    free(aux);
}

static void *workerfn(void *arg)
{
    int fd, ret, r;
    struct worker *w = (struct worker *) arg;
    unsigned long ops = w->ops;
    struct epoll_event ev;
    uint64_t val;
    int to = nonblocking ? 0 : -1;
    int efd = multiq ? w->epollfd : epollfd;

    pthread_mutex_lock(&thread_lock);
    threads_starting--;
    if (!threads_starting)
        pthread_cond_signal(&thread_parent);
    pthread_cond_wait(&thread_worker, &thread_lock);
    pthread_mutex_unlock(&thread_lock);

    do {
        /*
         * Block indefinitely waiting for the IN event.
         * In order to stress the epoll_wait(2) syscall,
         * call it one event at a time, instead of with a
         * larger batch (max)limit.
         */
        do {
            ret = epoll_wait(efd, &ev, 1, to);
        } while (ret < 0 && errno == EINTR);
        if (ret < 0)
            err(EXIT_FAILURE, "epoll_wait");

        fd = ev.data.fd;

        do {
            r = read(fd, &val, sizeof(val));
        } while (!done && (r < 0 && errno == EAGAIN));

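        /*
         * The epoll_ctl(2) calls below are deliberately part of the
         * measured op: they are the "[others]" cost of re-adding
         * (EPOLLET) or rearming (EPOLLONESHOT) the fd described in
         * the header comment.
         */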
        if (et) {
            ev.events = EPOLLIN | EPOLLET;
            ret = epoll_ctl(efd, EPOLL_CTL_ADD, fd, &ev);
        }

        if (oneshot) {
            /* rearm the file descriptor with a new event mask */
            ev.events |= EPOLLIN | EPOLLONESHOT;
            ret = epoll_ctl(efd, EPOLL_CTL_MOD, fd, &ev);
        }

        ops++;
    } while (!done);

    if (multiq)
        close(w->epollfd);

    w->ops = ops;
    return NULL;
}

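/*
 * Builds a chain of empty, nested epoll instances hanging off the main
 * epoll fd: efd <- epollfdp[0] <- ... <- epollfdp[nested - 1]. The nested
 * instances never carry real events (the eventfds are registered on efd
 * directly); they only add depth to the wakeup path epoll has to walk.
 */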
static void nest_epollfd(struct worker *w)
{
    unsigned int i;
    struct epoll_event ev;
    int efd = multiq ? w->epollfd : epollfd;

    if (nested > EPOLL_MAXNESTS)
        nested = EPOLL_MAXNESTS;

    epollfdp = calloc(nested, sizeof(*epollfdp));
    if (!epollfdp)
        err(EXIT_FAILURE, "calloc");

    for (i = 0; i < nested; i++) {
        epollfdp[i] = epoll_create(1);
        if (epollfdp[i] < 0)
            err(EXIT_FAILURE, "epoll_create");
    }

    ev.events = EPOLLHUP; /* anything */
    ev.data.u64 = i; /* any number */

    for (i = nested - 1; i; i--) {
        if (epoll_ctl(epollfdp[i - 1], EPOLL_CTL_ADD,
                  epollfdp[i], &ev) < 0)
            err(EXIT_FAILURE, "epoll_ctl");
    }

    if (epoll_ctl(efd, EPOLL_CTL_ADD, *epollfdp, &ev) < 0)
        err(EXIT_FAILURE, "epoll_ctl");
}

static void toggle_done(int sig __maybe_unused,
            siginfo_t *info __maybe_unused,
            void *uc __maybe_unused)
{
    /* inform all threads that we're done for the day */
    done = true;
    gettimeofday(&bench__end, NULL);
    timersub(&bench__end, &bench__start, &bench__runtime);
}

static void print_summary(void)
{
    unsigned long avg = avg_stats(&throughput_stats);
    double stddev = stddev_stats(&throughput_stats);

    printf("\nAveraged %lu operations/sec (+- %.2f%%), total secs = %d\n",
           avg, rel_stddev_stats(stddev, avg),
           (int)bench__runtime.tv_sec);
}

static int do_threads(struct worker *worker, struct perf_cpu_map *cpu)
{
    pthread_attr_t thread_attr, *attrp = NULL;
    cpu_set_t *cpuset;
    unsigned int i, j;
    int ret = 0, events = EPOLLIN;
    int nrcpus;
    size_t size;

    if (oneshot)
        events |= EPOLLONESHOT;
    if (et)
        events |= EPOLLET;

    printinfo("starting worker/consumer %sthreads%s\n",
          noaffinity ? "" : "CPU affinity ",
          nonblocking ? " (nonblocking)" : "");
    if (!noaffinity)
        pthread_attr_init(&thread_attr);

    nrcpus = perf_cpu_map__nr(cpu);
    cpuset = CPU_ALLOC(nrcpus);
    BUG_ON(!cpuset);
    size = CPU_ALLOC_SIZE(nrcpus);

    for (i = 0; i < nthreads; i++) {
        struct worker *w = &worker[i];

        if (multiq) {
            w->epollfd = epoll_create(1);
            if (w->epollfd < 0)
                err(EXIT_FAILURE, "epoll_create");

            if (nested)
                nest_epollfd(w);
        }

        w->tid = i;
        w->fdmap = calloc(nfds, sizeof(int));
        if (!w->fdmap)
            return 1;

        for (j = 0; j < nfds; j++) {
            int efd = multiq ? w->epollfd : epollfd;
            struct epoll_event ev;

            w->fdmap[j] = eventfd(0, EFD_NONBLOCK);
            if (w->fdmap[j] < 0)
                err(EXIT_FAILURE, "eventfd");

            ev.data.fd = w->fdmap[j];
            ev.events = events;

            ret = epoll_ctl(efd, EPOLL_CTL_ADD,
                    w->fdmap[j], &ev);
            if (ret < 0)
                err(EXIT_FAILURE, "epoll_ctl");
        }

        if (!noaffinity) {
            CPU_ZERO_S(size, cpuset);
            CPU_SET_S(perf_cpu_map__cpu(cpu, i % perf_cpu_map__nr(cpu)).cpu,
                    size, cpuset);

            ret = pthread_attr_setaffinity_np(&thread_attr, size, cpuset);
            if (ret) {
                CPU_FREE(cpuset);
                err(EXIT_FAILURE, "pthread_attr_setaffinity_np");
            }

            attrp = &thread_attr;
        }

        ret = pthread_create(&w->thread, attrp, workerfn, (void *)w);
        if (ret) {
            CPU_FREE(cpuset);
            err(EXIT_FAILURE, "pthread_create");
        }
    }

    CPU_FREE(cpuset);
    if (!noaffinity)
        pthread_attr_destroy(&thread_attr);

    return ret;
}

static void *writerfn(void *p)
{
    struct worker *worker = p;
    size_t i, j, iter;
    const uint64_t val = 1;
    ssize_t sz;
    struct timespec ts = { .tv_sec = 0,
                   .tv_nsec = 500 };

    printinfo("starting writer-thread: doing %s writes ...\n",
          randomize ? "random" : "linear");

    for (iter = 0; !wdone; iter++) {
        if (randomize) {
            shuffle((void *)worker, nthreads, sizeof(*worker));
        }

        for (i = 0; i < nthreads; i++) {
            struct worker *w = &worker[i];

            if (randomize) {
                shuffle((void *)w->fdmap, nfds, sizeof(int));
            }

            for (j = 0; j < nfds; j++) {
                do {
                    sz = write(w->fdmap[j], &val, sizeof(val));
                } while (!wdone && (sz < 0 && errno == EAGAIN));
            }
        }

        nanosleep(&ts, NULL);
    }

    printinfo("exiting writer-thread (total full-loops: %zu)\n", iter);
    return NULL;
}

static int cmpworker(const void *p1, const void *p2)
{
    struct worker *w1 = (struct worker *) p1;
    struct worker *w2 = (struct worker *) p2;

    /* sort ascending by tid; must return <0, 0 or >0 for qsort(3) */
    return w1->tid - w2->tid;
}

int bench_epoll_wait(int argc, const char **argv)
{
    int ret = 0;
    struct sigaction act;
    unsigned int i;
    struct worker *worker = NULL;
    struct perf_cpu_map *cpu;
    pthread_t wthread;
    struct rlimit rl, prevrl;

    argc = parse_options(argc, argv, options, bench_epoll_wait_usage, 0);
    if (argc) {
        usage_with_options(bench_epoll_wait_usage, options);
        exit(EXIT_FAILURE);
    }

    memset(&act, 0, sizeof(act));
    sigfillset(&act.sa_mask);
    act.sa_sigaction = toggle_done;
    sigaction(SIGINT, &act, NULL);

    cpu = perf_cpu_map__new(NULL);
    if (!cpu)
        goto errmem;

    /* a single, main epoll instance */
    if (!multiq) {
        epollfd = epoll_create(1);
        if (epollfd < 0)
            err(EXIT_FAILURE, "epoll_create");

        /*
         * Deal with nested epolls, if any.
         */
        if (nested)
            nest_epollfd(NULL);
    }

    printinfo("Using %s queue model\n", multiq ? "multi" : "single");
    printinfo("Nesting level(s): %d\n", nested);

    /* default to the number of CPUs and leave one for the writer pthread */
    if (!nthreads)
        nthreads = perf_cpu_map__nr(cpu) - 1;

    worker = calloc(nthreads, sizeof(*worker));
    if (!worker)
        goto errmem;

    if (getrlimit(RLIMIT_NOFILE, &prevrl))
        err(EXIT_FAILURE, "getrlimit");
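    /*
     * Raise the open-fd limit to cover every eventfd plus generous
     * headroom; e.g. with 4 threads and the default nfds of 64 this is
     * 64 * 4 * 2 + 50 = 562 descriptors.
     */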
    rl.rlim_cur = rl.rlim_max = nfds * nthreads * 2 + 50;
    printinfo("Setting RLIMIT_NOFILE rlimit from %" PRIu64 " to: %" PRIu64 "\n",
          (uint64_t)prevrl.rlim_max, (uint64_t)rl.rlim_max);
    if (setrlimit(RLIMIT_NOFILE, &rl) < 0)
        err(EXIT_FAILURE, "setrlimit");

    printf("Run summary [PID %d]: %d threads monitoring%s on "
           "%d file-descriptors for %d secs.\n\n",
           getpid(), nthreads, oneshot ? " (EPOLLONESHOT semantics)" : "", nfds, nsecs);

    init_stats(&throughput_stats);
    pthread_mutex_init(&thread_lock, NULL);
    pthread_cond_init(&thread_parent, NULL);
    pthread_cond_init(&thread_worker, NULL);

    threads_starting = nthreads;

    gettimeofday(&bench__start, NULL);

    do_threads(worker, cpu);

    pthread_mutex_lock(&thread_lock);
    while (threads_starting)
        pthread_cond_wait(&thread_parent, &thread_lock);
    pthread_cond_broadcast(&thread_worker);
    pthread_mutex_unlock(&thread_lock);

    /*
     * At this point the workers should be blocked waiting for read events
     * to become ready. Launch the writer which will constantly be writing
     * to each thread's fdmap.
     */
    ret = pthread_create(&wthread, NULL, writerfn, (void *)worker);
    if (ret)
        err(EXIT_FAILURE, "pthread_create");

    sleep(nsecs);
    toggle_done(0, NULL, NULL);
    printinfo("main thread: toggling done\n");

    sleep(1); /* meh */
    wdone = true;
    ret = pthread_join(wthread, NULL);
    if (ret)
        err(EXIT_FAILURE, "pthread_join");

    /* cleanup & report results */
    pthread_cond_destroy(&thread_parent);
    pthread_cond_destroy(&thread_worker);
    pthread_mutex_destroy(&thread_lock);

    /* sort the array back before reporting */
    if (randomize)
        qsort(worker, nthreads, sizeof(struct worker), cmpworker);

    for (i = 0; i < nthreads; i++) {
        unsigned long t = bench__runtime.tv_sec > 0 ?
            worker[i].ops / bench__runtime.tv_sec : 0;

        update_stats(&throughput_stats, t);

        if (nfds == 1)
            printf("[thread %2d] fdmap: %p [ %04lu ops/sec ]\n",
                   worker[i].tid, &worker[i].fdmap[0], t);
        else
            printf("[thread %2d] fdmap: %p ... %p [ %04lu ops/sec ]\n",
                   worker[i].tid, &worker[i].fdmap[0],
                   &worker[i].fdmap[nfds-1], t);
    }

    print_summary();

    close(epollfd);
    return ret;
errmem:
    err(EXIT_FAILURE, "calloc");
}
#endif // HAVE_EVENTFD_SUPPORT