Back to home page

OSCL-LXR

 
 

    


0001 // SPDX-License-Identifier: GPL-2.0
0002 /* Copyright (c) 2019 Facebook
0003  *
0004  * This program is free software; you can redistribute it and/or
0005  * modify it under the terms of version 2 of the GNU General Public
0006  * License as published by the Free Software Foundation.
0007  *
0008  * Example program for Host Bandwidth Management
0009  *
0010  * This program loads a cgroup skb BPF program to enforce cgroup output
0011  * (egress) or input (ingress) bandwidth limits.
0012  *
0013  * USAGE: hbm [-d] [-l] [-n <id>] [-r <rate>] [-s] [-t <secs>] [-w] [-h] [prog]
0014  *   Where:
0015  *    -d    Print BPF trace debug buffer
0016  *    -l    Also limit flows doing loopback
0017  *    -n <#>    To create cgroup \"/hbm#\" and attach prog
0018  *      Default is /hbm1
0019  *    --no_cn   Do not return cn notifications
0020  *    -r <rate> Rate limit in Mbps
0021  *    -s    Get HBM stats (marked, dropped, etc.)
0022  *    -t <time> Exit after specified seconds (default is 1)
0023  *    -w    Work conserving flag. cgroup can increase its bandwidth
0024  *      beyond the rate limit specified while there is available
0025  *      bandwidth. Current implementation assumes there is only
0026  *      one NIC (eth0), but can be extended to support multiple NICs.
0027  *      Currently only supported for egress.
0028  *    -h    Print this info
0029  *    prog  BPF program file name. Name defaults to hbm_out_kern.o
0030  */
0031 
0032 #define _GNU_SOURCE
0033 
0034 #include <stdio.h>
0035 #include <stdlib.h>
0036 #include <assert.h>
0037 #include <sys/time.h>
0038 #include <unistd.h>
0039 #include <errno.h>
0040 #include <fcntl.h>
0041 #include <linux/unistd.h>
0042 #include <linux/compiler.h>
0043 
0044 #include <linux/bpf.h>
0045 #include <bpf/bpf.h>
0046 #include <getopt.h>
0047 
0048 #include "cgroup_helpers.h"
0049 #include "hbm.h"
0050 #include "bpf_util.h"
0051 #include <bpf/libbpf.h>
0052 
/* Forward declarations of the file-local helpers defined further down. */
static void Usage(void);
static void read_trace_pipe2(void);
static void do_error(char *msg, bool errno_flag);

/* Tracing directory; "trace_pipe" is appended to it when the pipe is opened. */
#define DEBUGFS "/sys/kernel/debug/tracing/"

/* Rate settings, both in Mbps. */
int minRate = 1000;	/* cgroup rate limit; floor when the rate is lowered */
int rate = 1000;	/* current limit; can grow if rate conserving is enabled */

/* Run time in seconds before the stats are collected and the program exits. */
int dur = 1;

/*
 * Direction selector: when false, run_bpf_prog() switches the expected
 * attach type to ingress before attaching.
 */
bool outFlag = true;

/* Behaviour switches set from the command line in main(). */
bool stats_flag;		/* -s: collect and write HBM stats */
bool loopback_flag;		/* -l: also limit loopback flows */
bool debugFlag;			/* -d: dump the BPF trace pipe */
bool work_conserving_flag;	/* -w: adapt the rate to NIC usage */
bool no_cn_flag;		/* --no_cn: forwarded to the BPF program */
bool edt_flag;			/* --edt: selects the EDT program and stats format */

/* State filled in by prog_load() and shared with run_bpf_prog(). */
static struct bpf_object *obj;		/* opened and loaded BPF object */
static struct bpf_program *bpf_prog;	/* program chosen from that object */
static int queue_stats_fd;		/* fd of the "queue_stats" map */
0073 
0074 static void read_trace_pipe2(void)
0075 {
0076     int trace_fd;
0077     FILE *outf;
0078     char *outFname = "hbm_out.log";
0079 
0080     trace_fd = open(DEBUGFS "trace_pipe", O_RDONLY, 0);
0081     if (trace_fd < 0) {
0082         printf("Error opening trace_pipe\n");
0083         return;
0084     }
0085 
0086 //  Future support of ingress
0087 //  if (!outFlag)
0088 //      outFname = "hbm_in.log";
0089     outf = fopen(outFname, "w");
0090 
0091     if (outf == NULL)
0092         printf("Error creating %s\n", outFname);
0093 
0094     while (1) {
0095         static char buf[4097];
0096         ssize_t sz;
0097 
0098         sz = read(trace_fd, buf, sizeof(buf) - 1);
0099         if (sz > 0) {
0100             buf[sz] = 0;
0101             puts(buf);
0102             if (outf != NULL) {
0103                 fprintf(outf, "%s\n", buf);
0104                 fflush(outf);
0105             }
0106         }
0107     }
0108 }
0109 
/*
 * Report a fatal error on stdout and terminate the process with status 1.
 *
 * msg:        text describing what failed.
 * errno_flag: when true, the current errno value is appended to the message.
 *
 * This function does not return.
 */
static void do_error(char *msg, bool errno_flag)
{
	if (!errno_flag)
		printf("ERROR: %s\n", msg);
	else
		printf("ERROR: %s, errno: %d\n", msg, errno);

	exit(1);
}
0118 
0119 static int prog_load(char *prog)
0120 {
0121     struct bpf_program *pos;
0122     const char *sec_name;
0123 
0124     obj = bpf_object__open_file(prog, NULL);
0125     if (libbpf_get_error(obj)) {
0126         printf("ERROR: opening BPF object file failed\n");
0127         return 1;
0128     }
0129 
0130     /* load BPF program */
0131     if (bpf_object__load(obj)) {
0132         printf("ERROR: loading BPF object file failed\n");
0133         goto err;
0134     }
0135 
0136     bpf_object__for_each_program(pos, obj) {
0137         sec_name = bpf_program__section_name(pos);
0138         if (sec_name && !strcmp(sec_name, "cgroup_skb/egress")) {
0139             bpf_prog = pos;
0140             break;
0141         }
0142     }
0143     if (!bpf_prog) {
0144         printf("ERROR: finding a prog in obj file failed\n");
0145         goto err;
0146     }
0147 
0148     queue_stats_fd = bpf_object__find_map_fd_by_name(obj, "queue_stats");
0149     if (queue_stats_fd < 0) {
0150         printf("ERROR: finding a map in obj file failed\n");
0151         goto err;
0152     }
0153 
0154     return 0;
0155 
0156 err:
0157     bpf_object__close(obj);
0158     return 1;
0159 }
0160 
0161 static int run_bpf_prog(char *prog, int cg_id)
0162 {
0163     struct hbm_queue_stats qstats = {0};
0164     char cg_dir[100], cg_pin_path[100];
0165     struct bpf_link *link = NULL;
0166     int key = 0;
0167     int cg1 = 0;
0168     int rc = 0;
0169 
0170     sprintf(cg_dir, "/hbm%d", cg_id);
0171     rc = prog_load(prog);
0172     if (rc != 0)
0173         return rc;
0174 
0175     if (setup_cgroup_environment()) {
0176         printf("ERROR: setting cgroup environment\n");
0177         goto err;
0178     }
0179     cg1 = create_and_get_cgroup(cg_dir);
0180     if (!cg1) {
0181         printf("ERROR: create_and_get_cgroup\n");
0182         goto err;
0183     }
0184     if (join_cgroup(cg_dir)) {
0185         printf("ERROR: join_cgroup\n");
0186         goto err;
0187     }
0188 
0189     qstats.rate = rate;
0190     qstats.stats = stats_flag ? 1 : 0;
0191     qstats.loopback = loopback_flag ? 1 : 0;
0192     qstats.no_cn = no_cn_flag ? 1 : 0;
0193     if (bpf_map_update_elem(queue_stats_fd, &key, &qstats, BPF_ANY)) {
0194         printf("ERROR: Could not update map element\n");
0195         goto err;
0196     }
0197 
0198     if (!outFlag)
0199         bpf_program__set_expected_attach_type(bpf_prog, BPF_CGROUP_INET_INGRESS);
0200 
0201     link = bpf_program__attach_cgroup(bpf_prog, cg1);
0202     if (libbpf_get_error(link)) {
0203         fprintf(stderr, "ERROR: bpf_program__attach_cgroup failed\n");
0204         goto err;
0205     }
0206 
0207     sprintf(cg_pin_path, "/sys/fs/bpf/hbm%d", cg_id);
0208     rc = bpf_link__pin(link, cg_pin_path);
0209     if (rc < 0) {
0210         printf("ERROR: bpf_link__pin failed: %d\n", rc);
0211         goto err;
0212     }
0213 
0214     if (work_conserving_flag) {
0215         struct timeval t0, t_last, t_new;
0216         FILE *fin;
0217         unsigned long long last_eth_tx_bytes, new_eth_tx_bytes;
0218         signed long long last_cg_tx_bytes, new_cg_tx_bytes;
0219         signed long long delta_time, delta_bytes, delta_rate;
0220         int delta_ms;
0221 #define DELTA_RATE_CHECK 10000      /* in us */
0222 #define RATE_THRESHOLD 9500000000   /* 9.5 Gbps */
0223 
0224         bpf_map_lookup_elem(queue_stats_fd, &key, &qstats);
0225         if (gettimeofday(&t0, NULL) < 0)
0226             do_error("gettimeofday failed", true);
0227         t_last = t0;
0228         fin = fopen("/sys/class/net/eth0/statistics/tx_bytes", "r");
0229         if (fscanf(fin, "%llu", &last_eth_tx_bytes) != 1)
0230             do_error("fscanf fails", false);
0231         fclose(fin);
0232         last_cg_tx_bytes = qstats.bytes_total;
0233         while (true) {
0234             usleep(DELTA_RATE_CHECK);
0235             if (gettimeofday(&t_new, NULL) < 0)
0236                 do_error("gettimeofday failed", true);
0237             delta_ms = (t_new.tv_sec - t0.tv_sec) * 1000 +
0238                 (t_new.tv_usec - t0.tv_usec)/1000;
0239             if (delta_ms > dur * 1000)
0240                 break;
0241             delta_time = (t_new.tv_sec - t_last.tv_sec) * 1000000 +
0242                 (t_new.tv_usec - t_last.tv_usec);
0243             if (delta_time == 0)
0244                 continue;
0245             t_last = t_new;
0246             fin = fopen("/sys/class/net/eth0/statistics/tx_bytes",
0247                     "r");
0248             if (fscanf(fin, "%llu", &new_eth_tx_bytes) != 1)
0249                 do_error("fscanf fails", false);
0250             fclose(fin);
0251             printf("  new_eth_tx_bytes:%llu\n",
0252                    new_eth_tx_bytes);
0253             bpf_map_lookup_elem(queue_stats_fd, &key, &qstats);
0254             new_cg_tx_bytes = qstats.bytes_total;
0255             delta_bytes = new_eth_tx_bytes - last_eth_tx_bytes;
0256             last_eth_tx_bytes = new_eth_tx_bytes;
0257             delta_rate = (delta_bytes * 8000000) / delta_time;
0258             printf("%5d - eth_rate:%.1fGbps cg_rate:%.3fGbps",
0259                    delta_ms, delta_rate/1000000000.0,
0260                    rate/1000.0);
0261             if (delta_rate < RATE_THRESHOLD) {
0262                 /* can increase cgroup rate limit, but first
0263                  * check if we are using the current limit.
0264                  * Currently increasing by 6.25%, unknown
0265                  * if that is the optimal rate.
0266                  */
0267                 int rate_diff100;
0268 
0269                 delta_bytes = new_cg_tx_bytes -
0270                     last_cg_tx_bytes;
0271                 last_cg_tx_bytes = new_cg_tx_bytes;
0272                 delta_rate = (delta_bytes * 8000000) /
0273                     delta_time;
0274                 printf(" rate:%.3fGbps",
0275                        delta_rate/1000000000.0);
0276                 rate_diff100 = (((long long)rate)*1000000 -
0277                              delta_rate) * 100 /
0278                     (((long long) rate) * 1000000);
0279                 printf("  rdiff:%d", rate_diff100);
0280                 if (rate_diff100  <= 3) {
0281                     rate += (rate >> 4);
0282                     if (rate > RATE_THRESHOLD / 1000000)
0283                         rate = RATE_THRESHOLD / 1000000;
0284                     qstats.rate = rate;
0285                     printf(" INC\n");
0286                 } else {
0287                     printf("\n");
0288                 }
0289             } else {
0290                 /* Need to decrease cgroup rate limit.
0291                  * Currently decreasing by 12.5%, unknown
0292                  * if that is optimal
0293                  */
0294                 printf(" DEC\n");
0295                 rate -= (rate >> 3);
0296                 if (rate < minRate)
0297                     rate = minRate;
0298                 qstats.rate = rate;
0299             }
0300             if (bpf_map_update_elem(queue_stats_fd, &key, &qstats, BPF_ANY))
0301                 do_error("update map element fails", false);
0302         }
0303     } else {
0304         sleep(dur);
0305     }
0306     // Get stats!
0307     if (stats_flag && bpf_map_lookup_elem(queue_stats_fd, &key, &qstats)) {
0308         char fname[100];
0309         FILE *fout;
0310 
0311         if (!outFlag)
0312             sprintf(fname, "hbm.%d.in", cg_id);
0313         else
0314             sprintf(fname, "hbm.%d.out", cg_id);
0315         fout = fopen(fname, "w");
0316         fprintf(fout, "id:%d\n", cg_id);
0317         fprintf(fout, "ERROR: Could not lookup queue_stats\n");
0318     } else if (stats_flag && qstats.lastPacketTime >
0319            qstats.firstPacketTime) {
0320         long long delta_us = (qstats.lastPacketTime -
0321                       qstats.firstPacketTime)/1000;
0322         unsigned int rate_mbps = ((qstats.bytes_total -
0323                        qstats.bytes_dropped) * 8 /
0324                       delta_us);
0325         double percent_pkts, percent_bytes;
0326         char fname[100];
0327         FILE *fout;
0328         int k;
0329         static const char *returnValNames[] = {
0330             "DROP_PKT",
0331             "ALLOW_PKT",
0332             "DROP_PKT_CWR",
0333             "ALLOW_PKT_CWR"
0334         };
0335 #define RET_VAL_COUNT 4
0336 
0337 // Future support of ingress
0338 //      if (!outFlag)
0339 //          sprintf(fname, "hbm.%d.in", cg_id);
0340 //      else
0341         sprintf(fname, "hbm.%d.out", cg_id);
0342         fout = fopen(fname, "w");
0343         fprintf(fout, "id:%d\n", cg_id);
0344         fprintf(fout, "rate_mbps:%d\n", rate_mbps);
0345         fprintf(fout, "duration:%.1f secs\n",
0346             (qstats.lastPacketTime - qstats.firstPacketTime) /
0347             1000000000.0);
0348         fprintf(fout, "packets:%d\n", (int)qstats.pkts_total);
0349         fprintf(fout, "bytes_MB:%d\n", (int)(qstats.bytes_total /
0350                              1000000));
0351         fprintf(fout, "pkts_dropped:%d\n", (int)qstats.pkts_dropped);
0352         fprintf(fout, "bytes_dropped_MB:%d\n",
0353             (int)(qstats.bytes_dropped /
0354                                1000000));
0355         // Marked Pkts and Bytes
0356         percent_pkts = (qstats.pkts_marked * 100.0) /
0357             (qstats.pkts_total + 1);
0358         percent_bytes = (qstats.bytes_marked * 100.0) /
0359             (qstats.bytes_total + 1);
0360         fprintf(fout, "pkts_marked_percent:%6.2f\n", percent_pkts);
0361         fprintf(fout, "bytes_marked_percent:%6.2f\n", percent_bytes);
0362 
0363         // Dropped Pkts and Bytes
0364         percent_pkts = (qstats.pkts_dropped * 100.0) /
0365             (qstats.pkts_total + 1);
0366         percent_bytes = (qstats.bytes_dropped * 100.0) /
0367             (qstats.bytes_total + 1);
0368         fprintf(fout, "pkts_dropped_percent:%6.2f\n", percent_pkts);
0369         fprintf(fout, "bytes_dropped_percent:%6.2f\n", percent_bytes);
0370 
0371         // ECN CE markings
0372         percent_pkts = (qstats.pkts_ecn_ce * 100.0) /
0373             (qstats.pkts_total + 1);
0374         fprintf(fout, "pkts_ecn_ce:%6.2f (%d)\n", percent_pkts,
0375             (int)qstats.pkts_ecn_ce);
0376 
0377         // Average cwnd
0378         fprintf(fout, "avg cwnd:%d\n",
0379             (int)(qstats.sum_cwnd / (qstats.sum_cwnd_cnt + 1)));
0380         // Average rtt
0381         fprintf(fout, "avg rtt:%d\n",
0382             (int)(qstats.sum_rtt / (qstats.pkts_total + 1)));
0383         // Average credit
0384         if (edt_flag)
0385             fprintf(fout, "avg credit_ms:%.03f\n",
0386                 (qstats.sum_credit /
0387                  (qstats.pkts_total + 1.0)) / 1000000.0);
0388         else
0389             fprintf(fout, "avg credit:%d\n",
0390                 (int)(qstats.sum_credit /
0391                       (1500 * ((int)qstats.pkts_total ) + 1)));
0392 
0393         // Return values stats
0394         for (k = 0; k < RET_VAL_COUNT; k++) {
0395             percent_pkts = (qstats.returnValCount[k] * 100.0) /
0396                 (qstats.pkts_total + 1);
0397             fprintf(fout, "%s:%6.2f (%d)\n", returnValNames[k],
0398                 percent_pkts, (int)qstats.returnValCount[k]);
0399         }
0400         fclose(fout);
0401     }
0402 
0403     if (debugFlag)
0404         read_trace_pipe2();
0405     goto cleanup;
0406 
0407 err:
0408     rc = 1;
0409 
0410 cleanup:
0411     bpf_link__destroy(link);
0412     bpf_object__close(obj);
0413 
0414     if (cg1 != -1)
0415         close(cg1);
0416 
0417     if (rc != 0)
0418         cleanup_cgroup_environment();
0419     return rc;
0420 }
0421 
/*
 * Print the command-line help text to stdout.
 *
 * The default shown for -t matches the initial value of the global dur (1).
 */
static void Usage(void)
{
	printf("This program loads a cgroup skb BPF program to enforce\n"
	       "cgroup output (egress) bandwidth limits.\n\n"
	       "USAGE: hbm [-o] [-d]  [-l] [-n <id>] [--no_cn] [-r <rate>]\n"
	       "           [-s] [-t <secs>] [-w] [-h] [prog]\n"
	       "  Where:\n"
	       "    -o         indicates egress direction (default)\n"
	       "    -d         print BPF trace debug buffer\n"
	       "    --edt      use fq's Earliest Departure Time\n"
	       "    -l         also limit flows using loopback\n"
	       "    -n <#>     to create cgroup \"/hbm#\" and attach prog\n"
	       "               Default is /hbm1\n"
	       "    --no_cn    disable CN notifications\n"
	       "    -r <rate>  Rate in Mbps\n"
	       "    -s         Update HBM stats\n"
	       "    -t <time>  Exit after specified seconds (default is 1)\n"
	       "    -w         Work conserving flag. cgroup can increase\n"
	       "               bandwidth beyond the rate limit specified\n"
	       "               while there is available bandwidth. Current\n"
	       "               implementation assumes there is only eth0\n"
	       "               but can be extended to support multiple NICs\n"
	       "    -h         print this info\n"
	       "    prog       BPF program file name. Name defaults to\n"
	       "                 hbm_out_kern.o\n");
}
0448 
0449 int main(int argc, char **argv)
0450 {
0451     char *prog = "hbm_out_kern.o";
0452     int  k;
0453     int cg_id = 1;
0454     char *optstring = "iodln:r:st:wh";
0455     struct option loptions[] = {
0456         {"no_cn", 0, NULL, 1},
0457         {"edt", 0, NULL, 2},
0458         {NULL, 0, NULL, 0}
0459     };
0460 
0461     while ((k = getopt_long(argc, argv, optstring, loptions, NULL)) != -1) {
0462         switch (k) {
0463         case 1:
0464             no_cn_flag = true;
0465             break;
0466         case 2:
0467             prog = "hbm_edt_kern.o";
0468             edt_flag = true;
0469             break;
0470         case'o':
0471             break;
0472         case 'd':
0473             debugFlag = true;
0474             break;
0475         case 'l':
0476             loopback_flag = true;
0477             break;
0478         case 'n':
0479             cg_id = atoi(optarg);
0480             break;
0481         case 'r':
0482             minRate = atoi(optarg) * 1.024;
0483             rate = minRate;
0484             break;
0485         case 's':
0486             stats_flag = true;
0487             break;
0488         case 't':
0489             dur = atoi(optarg);
0490             break;
0491         case 'w':
0492             work_conserving_flag = true;
0493             break;
0494         case '?':
0495             if (optopt == 'n' || optopt == 'r' || optopt == 't')
0496                 fprintf(stderr,
0497                     "Option -%c requires an argument.\n\n",
0498                     optopt);
0499         case 'h':
0500             __fallthrough;
0501         default:
0502             Usage();
0503             return 0;
0504         }
0505     }
0506 
0507     if (optind < argc)
0508         prog = argv[optind];
0509     printf("HBM prog: %s\n", prog != NULL ? prog : "NULL");
0510 
0511     /* Use libbpf 1.0 API mode */
0512     libbpf_set_strict_mode(LIBBPF_STRICT_ALL);
0513 
0514     return run_bpf_prog(prog, cg_id);
0515 }