0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018
0019
0020
0021
0022
0023
0024
0025
0026
0027
0028
0029
0030
0031
0032 #define _GNU_SOURCE
0033
0034 #include <stdio.h>
0035 #include <stdlib.h>
0036 #include <assert.h>
0037 #include <sys/time.h>
0038 #include <unistd.h>
0039 #include <errno.h>
0040 #include <fcntl.h>
0041 #include <linux/unistd.h>
0042 #include <linux/compiler.h>
0043
0044 #include <linux/bpf.h>
0045 #include <bpf/bpf.h>
0046 #include <getopt.h>
0047
0048 #include "cgroup_helpers.h"
0049 #include "hbm.h"
0050 #include "bpf_util.h"
0051 #include <bpf/libbpf.h>
0052
/*
 * Run-time configuration.  main() fills most of these in from the command
 * line; run_bpf_prog() and read_trace_pipe2() consume them.
 */
bool outFlag = true;		/* when false, the prog's attach type is switched to ingress */
int minRate = 1000;		/* lower bound applied when the work-conserving loop cuts rate */
int rate = 1000;		/* limit copied into queue_stats.rate */
int dur = 1;			/* run time in seconds */
bool stats_flag;		/* write the hbm.<id>.* stats file at the end */
bool loopback_flag;		/* forwarded to the BPF side as queue_stats.loopback */
bool debugFlag;			/* dump the trace pipe once the run is over */
bool work_conserving_flag;	/* adjust rate dynamically instead of just sleeping */
bool no_cn_flag;		/* forwarded to the BPF side as queue_stats.no_cn */
bool edt_flag;			/* selects the "avg credit_ms" stats line */

/* Forward declarations of helpers defined later in this file. */
static void Usage(void);
static void read_trace_pipe2(void);
static void do_error(char *msg, bool errno_flag);

/* Directory that holds the trace_pipe file. */
#define DEBUGFS "/sys/kernel/debug/tracing/"

/* Handles populated by prog_load(). */
static struct bpf_program *bpf_prog;
static struct bpf_object *obj;
static int queue_stats_fd;
0073
0074 static void read_trace_pipe2(void)
0075 {
0076 int trace_fd;
0077 FILE *outf;
0078 char *outFname = "hbm_out.log";
0079
0080 trace_fd = open(DEBUGFS "trace_pipe", O_RDONLY, 0);
0081 if (trace_fd < 0) {
0082 printf("Error opening trace_pipe\n");
0083 return;
0084 }
0085
0086
0087
0088
0089 outf = fopen(outFname, "w");
0090
0091 if (outf == NULL)
0092 printf("Error creating %s\n", outFname);
0093
0094 while (1) {
0095 static char buf[4097];
0096 ssize_t sz;
0097
0098 sz = read(trace_fd, buf, sizeof(buf) - 1);
0099 if (sz > 0) {
0100 buf[sz] = 0;
0101 puts(buf);
0102 if (outf != NULL) {
0103 fprintf(outf, "%s\n", buf);
0104 fflush(outf);
0105 }
0106 }
0107 }
0108 }
0109
/*
 * Report a fatal error on stdout and terminate the process with status 1.
 *
 * @msg:        text printed after the "ERROR: " prefix
 * @errno_flag: when true, the current errno value is appended to the message
 *
 * Does not return.
 */
static void do_error(char *msg, bool errno_flag)
{
	if (!errno_flag) {
		printf("ERROR: %s\n", msg);
		exit(1);
	}

	printf("ERROR: %s, errno: %d\n", msg, errno);
	exit(1);
}
0118
0119 static int prog_load(char *prog)
0120 {
0121 struct bpf_program *pos;
0122 const char *sec_name;
0123
0124 obj = bpf_object__open_file(prog, NULL);
0125 if (libbpf_get_error(obj)) {
0126 printf("ERROR: opening BPF object file failed\n");
0127 return 1;
0128 }
0129
0130
0131 if (bpf_object__load(obj)) {
0132 printf("ERROR: loading BPF object file failed\n");
0133 goto err;
0134 }
0135
0136 bpf_object__for_each_program(pos, obj) {
0137 sec_name = bpf_program__section_name(pos);
0138 if (sec_name && !strcmp(sec_name, "cgroup_skb/egress")) {
0139 bpf_prog = pos;
0140 break;
0141 }
0142 }
0143 if (!bpf_prog) {
0144 printf("ERROR: finding a prog in obj file failed\n");
0145 goto err;
0146 }
0147
0148 queue_stats_fd = bpf_object__find_map_fd_by_name(obj, "queue_stats");
0149 if (queue_stats_fd < 0) {
0150 printf("ERROR: finding a map in obj file failed\n");
0151 goto err;
0152 }
0153
0154 return 0;
0155
0156 err:
0157 bpf_object__close(obj);
0158 return 1;
0159 }
0160
0161 static int run_bpf_prog(char *prog, int cg_id)
0162 {
0163 struct hbm_queue_stats qstats = {0};
0164 char cg_dir[100], cg_pin_path[100];
0165 struct bpf_link *link = NULL;
0166 int key = 0;
0167 int cg1 = 0;
0168 int rc = 0;
0169
0170 sprintf(cg_dir, "/hbm%d", cg_id);
0171 rc = prog_load(prog);
0172 if (rc != 0)
0173 return rc;
0174
0175 if (setup_cgroup_environment()) {
0176 printf("ERROR: setting cgroup environment\n");
0177 goto err;
0178 }
0179 cg1 = create_and_get_cgroup(cg_dir);
0180 if (!cg1) {
0181 printf("ERROR: create_and_get_cgroup\n");
0182 goto err;
0183 }
0184 if (join_cgroup(cg_dir)) {
0185 printf("ERROR: join_cgroup\n");
0186 goto err;
0187 }
0188
0189 qstats.rate = rate;
0190 qstats.stats = stats_flag ? 1 : 0;
0191 qstats.loopback = loopback_flag ? 1 : 0;
0192 qstats.no_cn = no_cn_flag ? 1 : 0;
0193 if (bpf_map_update_elem(queue_stats_fd, &key, &qstats, BPF_ANY)) {
0194 printf("ERROR: Could not update map element\n");
0195 goto err;
0196 }
0197
0198 if (!outFlag)
0199 bpf_program__set_expected_attach_type(bpf_prog, BPF_CGROUP_INET_INGRESS);
0200
0201 link = bpf_program__attach_cgroup(bpf_prog, cg1);
0202 if (libbpf_get_error(link)) {
0203 fprintf(stderr, "ERROR: bpf_program__attach_cgroup failed\n");
0204 goto err;
0205 }
0206
0207 sprintf(cg_pin_path, "/sys/fs/bpf/hbm%d", cg_id);
0208 rc = bpf_link__pin(link, cg_pin_path);
0209 if (rc < 0) {
0210 printf("ERROR: bpf_link__pin failed: %d\n", rc);
0211 goto err;
0212 }
0213
0214 if (work_conserving_flag) {
0215 struct timeval t0, t_last, t_new;
0216 FILE *fin;
0217 unsigned long long last_eth_tx_bytes, new_eth_tx_bytes;
0218 signed long long last_cg_tx_bytes, new_cg_tx_bytes;
0219 signed long long delta_time, delta_bytes, delta_rate;
0220 int delta_ms;
0221 #define DELTA_RATE_CHECK 10000
0222 #define RATE_THRESHOLD 9500000000
0223
0224 bpf_map_lookup_elem(queue_stats_fd, &key, &qstats);
0225 if (gettimeofday(&t0, NULL) < 0)
0226 do_error("gettimeofday failed", true);
0227 t_last = t0;
0228 fin = fopen("/sys/class/net/eth0/statistics/tx_bytes", "r");
0229 if (fscanf(fin, "%llu", &last_eth_tx_bytes) != 1)
0230 do_error("fscanf fails", false);
0231 fclose(fin);
0232 last_cg_tx_bytes = qstats.bytes_total;
0233 while (true) {
0234 usleep(DELTA_RATE_CHECK);
0235 if (gettimeofday(&t_new, NULL) < 0)
0236 do_error("gettimeofday failed", true);
0237 delta_ms = (t_new.tv_sec - t0.tv_sec) * 1000 +
0238 (t_new.tv_usec - t0.tv_usec)/1000;
0239 if (delta_ms > dur * 1000)
0240 break;
0241 delta_time = (t_new.tv_sec - t_last.tv_sec) * 1000000 +
0242 (t_new.tv_usec - t_last.tv_usec);
0243 if (delta_time == 0)
0244 continue;
0245 t_last = t_new;
0246 fin = fopen("/sys/class/net/eth0/statistics/tx_bytes",
0247 "r");
0248 if (fscanf(fin, "%llu", &new_eth_tx_bytes) != 1)
0249 do_error("fscanf fails", false);
0250 fclose(fin);
0251 printf(" new_eth_tx_bytes:%llu\n",
0252 new_eth_tx_bytes);
0253 bpf_map_lookup_elem(queue_stats_fd, &key, &qstats);
0254 new_cg_tx_bytes = qstats.bytes_total;
0255 delta_bytes = new_eth_tx_bytes - last_eth_tx_bytes;
0256 last_eth_tx_bytes = new_eth_tx_bytes;
0257 delta_rate = (delta_bytes * 8000000) / delta_time;
0258 printf("%5d - eth_rate:%.1fGbps cg_rate:%.3fGbps",
0259 delta_ms, delta_rate/1000000000.0,
0260 rate/1000.0);
0261 if (delta_rate < RATE_THRESHOLD) {
0262
0263
0264
0265
0266
0267 int rate_diff100;
0268
0269 delta_bytes = new_cg_tx_bytes -
0270 last_cg_tx_bytes;
0271 last_cg_tx_bytes = new_cg_tx_bytes;
0272 delta_rate = (delta_bytes * 8000000) /
0273 delta_time;
0274 printf(" rate:%.3fGbps",
0275 delta_rate/1000000000.0);
0276 rate_diff100 = (((long long)rate)*1000000 -
0277 delta_rate) * 100 /
0278 (((long long) rate) * 1000000);
0279 printf(" rdiff:%d", rate_diff100);
0280 if (rate_diff100 <= 3) {
0281 rate += (rate >> 4);
0282 if (rate > RATE_THRESHOLD / 1000000)
0283 rate = RATE_THRESHOLD / 1000000;
0284 qstats.rate = rate;
0285 printf(" INC\n");
0286 } else {
0287 printf("\n");
0288 }
0289 } else {
0290
0291
0292
0293
0294 printf(" DEC\n");
0295 rate -= (rate >> 3);
0296 if (rate < minRate)
0297 rate = minRate;
0298 qstats.rate = rate;
0299 }
0300 if (bpf_map_update_elem(queue_stats_fd, &key, &qstats, BPF_ANY))
0301 do_error("update map element fails", false);
0302 }
0303 } else {
0304 sleep(dur);
0305 }
0306
0307 if (stats_flag && bpf_map_lookup_elem(queue_stats_fd, &key, &qstats)) {
0308 char fname[100];
0309 FILE *fout;
0310
0311 if (!outFlag)
0312 sprintf(fname, "hbm.%d.in", cg_id);
0313 else
0314 sprintf(fname, "hbm.%d.out", cg_id);
0315 fout = fopen(fname, "w");
0316 fprintf(fout, "id:%d\n", cg_id);
0317 fprintf(fout, "ERROR: Could not lookup queue_stats\n");
0318 } else if (stats_flag && qstats.lastPacketTime >
0319 qstats.firstPacketTime) {
0320 long long delta_us = (qstats.lastPacketTime -
0321 qstats.firstPacketTime)/1000;
0322 unsigned int rate_mbps = ((qstats.bytes_total -
0323 qstats.bytes_dropped) * 8 /
0324 delta_us);
0325 double percent_pkts, percent_bytes;
0326 char fname[100];
0327 FILE *fout;
0328 int k;
0329 static const char *returnValNames[] = {
0330 "DROP_PKT",
0331 "ALLOW_PKT",
0332 "DROP_PKT_CWR",
0333 "ALLOW_PKT_CWR"
0334 };
0335 #define RET_VAL_COUNT 4
0336
0337
0338
0339
0340
0341 sprintf(fname, "hbm.%d.out", cg_id);
0342 fout = fopen(fname, "w");
0343 fprintf(fout, "id:%d\n", cg_id);
0344 fprintf(fout, "rate_mbps:%d\n", rate_mbps);
0345 fprintf(fout, "duration:%.1f secs\n",
0346 (qstats.lastPacketTime - qstats.firstPacketTime) /
0347 1000000000.0);
0348 fprintf(fout, "packets:%d\n", (int)qstats.pkts_total);
0349 fprintf(fout, "bytes_MB:%d\n", (int)(qstats.bytes_total /
0350 1000000));
0351 fprintf(fout, "pkts_dropped:%d\n", (int)qstats.pkts_dropped);
0352 fprintf(fout, "bytes_dropped_MB:%d\n",
0353 (int)(qstats.bytes_dropped /
0354 1000000));
0355
0356 percent_pkts = (qstats.pkts_marked * 100.0) /
0357 (qstats.pkts_total + 1);
0358 percent_bytes = (qstats.bytes_marked * 100.0) /
0359 (qstats.bytes_total + 1);
0360 fprintf(fout, "pkts_marked_percent:%6.2f\n", percent_pkts);
0361 fprintf(fout, "bytes_marked_percent:%6.2f\n", percent_bytes);
0362
0363
0364 percent_pkts = (qstats.pkts_dropped * 100.0) /
0365 (qstats.pkts_total + 1);
0366 percent_bytes = (qstats.bytes_dropped * 100.0) /
0367 (qstats.bytes_total + 1);
0368 fprintf(fout, "pkts_dropped_percent:%6.2f\n", percent_pkts);
0369 fprintf(fout, "bytes_dropped_percent:%6.2f\n", percent_bytes);
0370
0371
0372 percent_pkts = (qstats.pkts_ecn_ce * 100.0) /
0373 (qstats.pkts_total + 1);
0374 fprintf(fout, "pkts_ecn_ce:%6.2f (%d)\n", percent_pkts,
0375 (int)qstats.pkts_ecn_ce);
0376
0377
0378 fprintf(fout, "avg cwnd:%d\n",
0379 (int)(qstats.sum_cwnd / (qstats.sum_cwnd_cnt + 1)));
0380
0381 fprintf(fout, "avg rtt:%d\n",
0382 (int)(qstats.sum_rtt / (qstats.pkts_total + 1)));
0383
0384 if (edt_flag)
0385 fprintf(fout, "avg credit_ms:%.03f\n",
0386 (qstats.sum_credit /
0387 (qstats.pkts_total + 1.0)) / 1000000.0);
0388 else
0389 fprintf(fout, "avg credit:%d\n",
0390 (int)(qstats.sum_credit /
0391 (1500 * ((int)qstats.pkts_total ) + 1)));
0392
0393
0394 for (k = 0; k < RET_VAL_COUNT; k++) {
0395 percent_pkts = (qstats.returnValCount[k] * 100.0) /
0396 (qstats.pkts_total + 1);
0397 fprintf(fout, "%s:%6.2f (%d)\n", returnValNames[k],
0398 percent_pkts, (int)qstats.returnValCount[k]);
0399 }
0400 fclose(fout);
0401 }
0402
0403 if (debugFlag)
0404 read_trace_pipe2();
0405 goto cleanup;
0406
0407 err:
0408 rc = 1;
0409
0410 cleanup:
0411 bpf_link__destroy(link);
0412 bpf_object__close(obj);
0413
0414 if (cg1 != -1)
0415 close(cg1);
0416
0417 if (rc != 0)
0418 cleanup_cgroup_environment();
0419 return rc;
0420 }
0421
/*
 * Print the command-line help text on stdout.
 *
 * The text is kept in step with main(): the synopsis lists --edt, and the
 * documented -t default matches the initial value of `dur` (1 second).
 */
static void Usage(void)
{
	printf("This program loads a cgroup skb BPF program to enforce\n"
	       "cgroup output (egress) bandwidth limits.\n\n"
	       "USAGE: hbm [-o] [-d] [--edt] [-l] [-n <id>] [--no_cn] [-r <rate>]\n"
	       " [-s] [-t <secs>] [-w] [-h] [prog]\n"
	       " Where:\n"
	       " -o indicates egress direction (default)\n"
	       " -d print BPF trace debug buffer\n"
	       " --edt use fq's Earliest Departure Time\n"
	       " -l also limit flows using loopback\n"
	       " -n <#> to create cgroup \"/hbm#\" and attach prog\n"
	       " Default is /hbm1\n"
	       " --no_cn disable CN notifications\n"
	       " -r <rate> Rate in Mbps\n"
	       " -s Update HBM stats\n"
	       " -t <time> Exit after specified seconds (default is 1)\n"
	       " -w Work conserving flag. cgroup can increase\n"
	       " bandwidth beyond the rate limit specified\n"
	       " while there is available bandwidth. Current\n"
	       " implementation assumes there is only eth0\n"
	       " but can be extended to support multiple NICs\n"
	       " -h print this info\n"
	       " prog BPF program file name. Name defaults to\n"
	       " hbm_out_kern.o\n");
}
0448
0449 int main(int argc, char **argv)
0450 {
0451 char *prog = "hbm_out_kern.o";
0452 int k;
0453 int cg_id = 1;
0454 char *optstring = "iodln:r:st:wh";
0455 struct option loptions[] = {
0456 {"no_cn", 0, NULL, 1},
0457 {"edt", 0, NULL, 2},
0458 {NULL, 0, NULL, 0}
0459 };
0460
0461 while ((k = getopt_long(argc, argv, optstring, loptions, NULL)) != -1) {
0462 switch (k) {
0463 case 1:
0464 no_cn_flag = true;
0465 break;
0466 case 2:
0467 prog = "hbm_edt_kern.o";
0468 edt_flag = true;
0469 break;
0470 case'o':
0471 break;
0472 case 'd':
0473 debugFlag = true;
0474 break;
0475 case 'l':
0476 loopback_flag = true;
0477 break;
0478 case 'n':
0479 cg_id = atoi(optarg);
0480 break;
0481 case 'r':
0482 minRate = atoi(optarg) * 1.024;
0483 rate = minRate;
0484 break;
0485 case 's':
0486 stats_flag = true;
0487 break;
0488 case 't':
0489 dur = atoi(optarg);
0490 break;
0491 case 'w':
0492 work_conserving_flag = true;
0493 break;
0494 case '?':
0495 if (optopt == 'n' || optopt == 'r' || optopt == 't')
0496 fprintf(stderr,
0497 "Option -%c requires an argument.\n\n",
0498 optopt);
0499 case 'h':
0500 __fallthrough;
0501 default:
0502 Usage();
0503 return 0;
0504 }
0505 }
0506
0507 if (optind < argc)
0508 prog = argv[optind];
0509 printf("HBM prog: %s\n", prog != NULL ? prog : "NULL");
0510
0511
0512 libbpf_set_strict_mode(LIBBPF_STRICT_ALL);
0513
0514 return run_bpf_prog(prog, cg_id);
0515 }