0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018 #define _GNU_SOURCE
0019 #include <stdio.h>
0020 #include <sys/mman.h>
0021 #include <sys/types.h>
0022 #include <sys/wait.h>
0023 #include <sys/ipc.h>
0024 #include <sys/shm.h>
0025 #include <sys/stat.h>
0026 #include <sys/time.h>
0027 #include <linux/futex.h>
0028 #include <unistd.h>
0029 #include <asm/unistd.h>
0030 #include <string.h>
0031 #include <stdlib.h>
0032 #include <fcntl.h>
0033 #include <sched.h>
0034 #include <time.h>
0035 #include <stdarg.h>
0036 #include <pthread.h>
0037 #include <signal.h>
0038 #include <sys/prctl.h>
0039
/*
 * dcbf - flush the data cache block that holds @addr.
 *
 * Executes the PowerPC "dcbf" instruction on the byte at @addr, followed by
 * a "sync". The "memory" clobber keeps the compiler from caching values or
 * moving memory accesses across the flush.
 */
static inline void dcbf(volatile unsigned int *addr)
{
	unsigned char *byte = (unsigned char *)addr;

	__asm__ __volatile__ ("dcbf %y0; sync" : : "Z"(*byte) : "memory");
}
0044
/*
 * err_msg - print a timestamped error banner on stdout and terminate.
 *
 * @msg: text printed after the "Error:" tag.
 *
 * Never returns: the process exits with status 1.
 */
static void err_msg(char *msg)
{
	time_t stamp = time(NULL);

	printf("=================================\n");
	printf(" Error: %s\n", msg);
	printf(" %s", ctime(&stamp));
	printf("=================================\n");

	exit(1);
}
0056
/*
 * Two attachments of the same shared memory segment (set up in main()).
 * Worker chunks are addressed through map1; the snapshot thread reads from
 * map1 and writes the copy back through map2.
 */
static char *map1;
static char *map2;

/* Process id written into the header of each verification log. */
static pid_t rim_process_pid;

/*
 * Set to 1 by a worker thread when a word it reads back does not match the
 * pattern it expects. All threads poll this flag to stop the test.
 */
static volatile unsigned int corruption_found;
0082
0083
0084
0085
0086
0087
0088
0089
0090
0091
/* Upper bound on the number of worker threads. */
#define MAX_THREADS 64

/* Width and mask of the thread-id field inside a store pattern. */
#define THREAD_ID_BITS 8
#define THREAD_ID_MASK ((1 << THREAD_ID_BITS) - 1)

/* Per-worker id (passed to the thread as its argument) and thread handle. */
static unsigned int rim_thread_ids[MAX_THREADS];
static pthread_t rim_threads[MAX_THREADS];
0097
0098
0099
0100
0101
0102
0103
0104
0105
0106
0107
/* Each worker thread owns one chunk of this many bytes. */
#define RIM_CHUNK_SIZE 1024

/* A "word" is one unsigned int; these describe its size in bytes and bits. */
#define BITS_PER_BYTE 8
#define WORD_SIZE (sizeof(unsigned int))
#define WORD_BITS (WORD_SIZE * BITS_PER_BYTE)

/* Number of words that make up one chunk. */
#define WORDS_PER_CHUNK (RIM_CHUNK_SIZE/WORD_SIZE)
0113
0114 static inline char *compute_chunk_start_addr(unsigned int thread_id)
0115 {
0116 char *chunk_start;
0117
0118 chunk_start = (char *)((unsigned long)map1 +
0119 (thread_id * RIM_CHUNK_SIZE));
0120
0121 return chunk_start;
0122 }
0123
0124
0125
0126
0127
0128
0129
0130
0131
0132 #define WORD_OFFSET_BITS (__builtin_ctz(WORDS_PER_CHUNK))
0133 #define WORD_OFFSET_MASK ((1 << WORD_OFFSET_BITS) - 1)
0134
0135 static inline unsigned int compute_word_offset(char *start, unsigned int *addr)
0136 {
0137 unsigned int delta_bytes, ret;
0138 delta_bytes = (unsigned long)addr - (unsigned long)start;
0139
0140 ret = delta_bytes/WORD_SIZE;
0141
0142 return ret;
0143 }
0144
0145
0146
0147
0148
0149
0150
0151
0152
0153
0154
0155
0156
0157
/*
 * Width and mask of the sweep-id field: whatever is left of a word once the
 * thread-id and word-offset fields have been accounted for.
 */
#define SWEEP_ID_BITS (WORD_BITS - (THREAD_ID_BITS + WORD_OFFSET_BITS))
#define SWEEP_ID_MASK ((1 << SWEEP_ID_BITS) - 1)
0160
0161
0162
0163
0164
0165
0166
0167
0168
0169
0170
0171
0172
0173
0174
0175
0176
0177
0178
0179
0180
0181
0182
0183
0184
0185
0186
0187
0188
0189
0190
0191
0192
0193 #define SWEEP_ID_SHIFT 0
0194 #define WORD_OFFSET_SHIFT (SWEEP_ID_BITS)
0195 #define THREAD_ID_SHIFT (WORD_OFFSET_BITS + SWEEP_ID_BITS)
0196
0197
0198
0199
0200
0201 static inline unsigned int compute_store_pattern(unsigned int tid,
0202 unsigned int *addr,
0203 unsigned int sweep_id)
0204 {
0205 unsigned int ret = 0;
0206 char *start = compute_chunk_start_addr(tid);
0207 unsigned int word_offset = compute_word_offset(start, addr);
0208
0209 ret += (tid & THREAD_ID_MASK) << THREAD_ID_SHIFT;
0210 ret += (word_offset & WORD_OFFSET_MASK) << WORD_OFFSET_SHIFT;
0211 ret += (sweep_id & SWEEP_ID_MASK) << SWEEP_ID_SHIFT;
0212 return ret;
0213 }
0214
0215
0216 static inline unsigned int extract_tid(unsigned int pattern)
0217 {
0218 unsigned int ret;
0219
0220 ret = (pattern >> THREAD_ID_SHIFT) & THREAD_ID_MASK;
0221 return ret;
0222 }
0223
0224
0225 static inline unsigned int extract_word_offset(unsigned int pattern)
0226 {
0227 unsigned int ret;
0228
0229 ret = (pattern >> WORD_OFFSET_SHIFT) & WORD_OFFSET_MASK;
0230
0231 return ret;
0232 }
0233
0234
0235 static inline unsigned int extract_sweep_id(unsigned int pattern)
0236
0237 {
0238 unsigned int ret;
0239
0240 ret = (pattern >> SWEEP_ID_SHIFT) & SWEEP_ID_MASK;
0241
0242 return ret;
0243 }
0244
0245
0246
0247
0248
0249
0250 #define LOGDIR_NAME_SIZE 100
0251 static char logdir[LOGDIR_NAME_SIZE];
0252
0253 static FILE *fp[MAX_THREADS];
0254 static const char logfilename[] ="Thread-%02d-Chunk";
0255
0256 static inline void start_verification_log(unsigned int tid,
0257 unsigned int *addr,
0258 unsigned int cur_sweep_id,
0259 unsigned int prev_sweep_id)
0260 {
0261 FILE *f;
0262 char logfile[30];
0263 char path[LOGDIR_NAME_SIZE + 30];
0264 char separator[2] = "/";
0265 char *chunk_start = compute_chunk_start_addr(tid);
0266 unsigned int size = RIM_CHUNK_SIZE;
0267
0268 sprintf(logfile, logfilename, tid);
0269 strcpy(path, logdir);
0270 strcat(path, separator);
0271 strcat(path, logfile);
0272 f = fopen(path, "w");
0273
0274 if (!f) {
0275 err_msg("Unable to create logfile\n");
0276 }
0277
0278 fp[tid] = f;
0279
0280 fprintf(f, "----------------------------------------------------------\n");
0281 fprintf(f, "PID = %d\n", rim_process_pid);
0282 fprintf(f, "Thread id = %02d\n", tid);
0283 fprintf(f, "Chunk Start Addr = 0x%016lx\n", (unsigned long)chunk_start);
0284 fprintf(f, "Chunk Size = %d\n", size);
0285 fprintf(f, "Next Store Addr = 0x%016lx\n", (unsigned long)addr);
0286 fprintf(f, "Current sweep-id = 0x%08x\n", cur_sweep_id);
0287 fprintf(f, "Previous sweep-id = 0x%08x\n", prev_sweep_id);
0288 fprintf(f, "----------------------------------------------------------\n");
0289 }
0290
0291 static inline void log_anamoly(unsigned int tid, unsigned int *addr,
0292 unsigned int expected, unsigned int observed)
0293 {
0294 FILE *f = fp[tid];
0295
0296 fprintf(f, "Thread %02d: Addr 0x%lx: Expected 0x%x, Observed 0x%x\n",
0297 tid, (unsigned long)addr, expected, observed);
0298 fprintf(f, "Thread %02d: Expected Thread id = %02d\n", tid, extract_tid(expected));
0299 fprintf(f, "Thread %02d: Observed Thread id = %02d\n", tid, extract_tid(observed));
0300 fprintf(f, "Thread %02d: Expected Word offset = %03d\n", tid, extract_word_offset(expected));
0301 fprintf(f, "Thread %02d: Observed Word offset = %03d\n", tid, extract_word_offset(observed));
0302 fprintf(f, "Thread %02d: Expected sweep-id = 0x%x\n", tid, extract_sweep_id(expected));
0303 fprintf(f, "Thread %02d: Observed sweep-id = 0x%x\n", tid, extract_sweep_id(observed));
0304 fprintf(f, "----------------------------------------------------------\n");
0305 }
0306
0307 static inline void end_verification_log(unsigned int tid, unsigned nr_anamolies)
0308 {
0309 FILE *f = fp[tid];
0310 char logfile[30];
0311 char path[LOGDIR_NAME_SIZE + 30];
0312 char separator[] = "/";
0313
0314 fclose(f);
0315
0316 if (nr_anamolies == 0) {
0317 remove(path);
0318 return;
0319 }
0320
0321 sprintf(logfile, logfilename, tid);
0322 strcpy(path, logdir);
0323 strcat(path, separator);
0324 strcat(path, logfile);
0325
0326 printf("Thread %02d chunk has %d corrupted words. For details check %s\n",
0327 tid, nr_anamolies, path);
0328 }
0329
0330
0331
0332
0333
0334
0335
0336
0337
0338
0339
0340
0341
0342
0343
0344
0345
0346
0347
0348
0349
0350
0351
0352
0353
0354
0355
0356
0357 static void verify_chunk(unsigned int tid, unsigned int *next_store_addr,
0358 unsigned int cur_sweep_id,
0359 unsigned int prev_sweep_id)
0360 {
0361 unsigned int *iter_ptr;
0362 unsigned int size = RIM_CHUNK_SIZE;
0363 unsigned int expected;
0364 unsigned int observed;
0365 char *chunk_start = compute_chunk_start_addr(tid);
0366
0367 int nr_anamolies = 0;
0368
0369 start_verification_log(tid, next_store_addr,
0370 cur_sweep_id, prev_sweep_id);
0371
0372 for (iter_ptr = (unsigned int *)chunk_start;
0373 (unsigned long)iter_ptr < (unsigned long)chunk_start + size;
0374 iter_ptr++) {
0375 unsigned int expected_sweep_id;
0376
0377 if (iter_ptr < next_store_addr) {
0378 expected_sweep_id = cur_sweep_id;
0379 } else {
0380 expected_sweep_id = prev_sweep_id;
0381 }
0382
0383 expected = compute_store_pattern(tid, iter_ptr, expected_sweep_id);
0384
0385 dcbf((volatile unsigned int*)iter_ptr);
0386 observed = *iter_ptr;
0387
0388 if (observed != expected) {
0389 nr_anamolies++;
0390 log_anamoly(tid, iter_ptr, expected, observed);
0391 }
0392 }
0393
0394 end_verification_log(tid, nr_anamolies);
0395 }
0396
/*
 * set_pthread_cpu - restrict thread @th to run on CPU @cpu only.
 *
 * @th:  thread to pin.
 * @cpu: CPU number.
 *
 * A failure to set the affinity is reported on stderr; the thread then keeps
 * its previous affinity. The SCHED_FIFO request below is compiled in but
 * disabled by the constant-false "0 &&".
 */
static void set_pthread_cpu(pthread_t th, int cpu)
{
	cpu_set_t run_cpu_mask;
	struct sched_param param;
	int err;

	CPU_ZERO(&run_cpu_mask);
	CPU_SET(cpu, &run_cpu_mask);
	/* pthread_setaffinity_np() returns an error number, not -1/errno. */
	err = pthread_setaffinity_np(th, sizeof(cpu_set_t), &run_cpu_mask);
	if (err) {
		fprintf(stderr, "could not pin thread to cpu %d: %s\n",
			cpu, strerror(err));
	}

	param.sched_priority = 1;
	if (0 && sched_setscheduler(0, SCHED_FIFO, &param) == -1) {
		fprintf(stderr, "could not set SCHED_FIFO, run as root?\n");
	}
}
0412
/*
 * set_mycpu - restrict the caller to run on CPU @cpu only.
 *
 * @cpu: CPU number.
 *
 * A failure to set the affinity is reported on stderr; the caller then keeps
 * its previous affinity. The SCHED_FIFO request below is compiled in but
 * disabled by the constant-false "0 &&".
 */
static void set_mycpu(int cpu)
{
	cpu_set_t run_cpu_mask;
	struct sched_param param;

	CPU_ZERO(&run_cpu_mask);
	CPU_SET(cpu, &run_cpu_mask);
	if (sched_setaffinity(0, sizeof(cpu_set_t), &run_cpu_mask) == -1) {
		perror("sched_setaffinity");
	}

	param.sched_priority = 1;
	if (0 && sched_setscheduler(0, SCHED_FIFO, &param) == -1) {
		fprintf(stderr, "could not set SCHED_FIFO, run as root?\n");
	}
}
0427
/*
 * Non-zero while the snapshot thread has the first page of map1
 * write-protected.
 */
static volatile int segv_wait;

/*
 * segv_handler - SIGSEGV handler for the worker threads.
 *
 * Yields the CPU for as long as segv_wait is set, then returns. None of the
 * handler arguments are used.
 */
static void segv_handler(int signo, siginfo_t *info, void *extra)
{
	for (;;) {
		if (!segv_wait)
			break;
		sched_yield();
	}
}
0437
0438 static void set_segv_handler(void)
0439 {
0440 struct sigaction sa;
0441
0442 sa.sa_flags = SA_SIGINFO;
0443 sa.sa_sigaction = segv_handler;
0444
0445 if (sigaction(SIGSEGV, &sa, NULL) == -1) {
0446 perror("sigaction");
0447 exit(EXIT_FAILURE);
0448 }
0449 }
0450
0451 int timeout = 0;
0452
0453
0454
0455
0456
0457
0458 static void *rim_fn(void *arg)
0459 {
0460 unsigned int tid = *((unsigned int *)arg);
0461
0462 int size = RIM_CHUNK_SIZE;
0463 char *chunk_start = compute_chunk_start_addr(tid);
0464
0465 unsigned int prev_sweep_id;
0466 unsigned int cur_sweep_id = 0;
0467
0468
0469 unsigned int pattern = cur_sweep_id;
0470 unsigned int *pattern_ptr = &pattern;
0471 unsigned int *w_ptr, read_data;
0472
0473 set_segv_handler();
0474
0475
0476
0477
0478
0479
0480
0481
0482
0483
0484 for (w_ptr = (unsigned int *)chunk_start;
0485 (unsigned long)w_ptr < (unsigned long)(chunk_start) + size;
0486 w_ptr++) {
0487
0488 *pattern_ptr = compute_store_pattern(tid, w_ptr, cur_sweep_id);
0489 *w_ptr = *pattern_ptr;
0490 }
0491
0492 while (!corruption_found && !timeout) {
0493 prev_sweep_id = cur_sweep_id;
0494 cur_sweep_id = cur_sweep_id + 1;
0495
0496 for (w_ptr = (unsigned int *)chunk_start;
0497 (unsigned long)w_ptr < (unsigned long)(chunk_start) + size;
0498 w_ptr++) {
0499 unsigned int old_pattern;
0500
0501
0502
0503
0504
0505
0506 old_pattern = compute_store_pattern(tid, w_ptr, prev_sweep_id);
0507
0508
0509
0510
0511
0512 dcbf((volatile unsigned int*)w_ptr);
0513
0514
0515 read_data = *w_ptr;
0516
0517
0518
0519
0520
0521 if (read_data != old_pattern) {
0522
0523 corruption_found = 1;
0524 }
0525
0526
0527
0528
0529
0530 if (corruption_found || timeout) {
0531
0532
0533
0534
0535
0536
0537
0538
0539 verify_chunk(tid, w_ptr, cur_sweep_id, prev_sweep_id);
0540
0541 return 0;
0542 }
0543
0544
0545
0546
0547
0548 *pattern_ptr = compute_store_pattern(tid, w_ptr, cur_sweep_id);
0549
0550
0551
0552
0553
0554 *w_ptr = *pattern_ptr;
0555 }
0556 }
0557
0558 return NULL;
0559 }
0560
0561
/* First CPU used for pinning (option -r); worker i is pinned to start_cpu + i. */
static unsigned long start_cpu = 0;
/* Number of worker threads (option -n). */
static unsigned long nrthreads = 4;

/* Handle of the thread running mem_snapshot_fn(). */
static pthread_t mem_snapshot_thread;
0566
0567 static void *mem_snapshot_fn(void *arg)
0568 {
0569 int page_size = getpagesize();
0570 size_t size = page_size;
0571 void *tmp = malloc(size);
0572
0573 while (!corruption_found && !timeout) {
0574
0575 segv_wait = 1;
0576
0577 mprotect(map1, size, PROT_READ);
0578
0579
0580
0581
0582
0583 memcpy(tmp, map1, size);
0584
0585
0586
0587
0588
0589
0590
0591 memcpy(map2, tmp, size);
0592
0593
0594
0595
0596 asm volatile("sync" ::: "memory");
0597 mprotect(map1, size, PROT_READ|PROT_WRITE);
0598 asm volatile("sync" ::: "memory");
0599 segv_wait = 0;
0600
0601 usleep(1);
0602 }
0603
0604 return 0;
0605 }
0606
0607 void alrm_sighandler(int sig)
0608 {
0609 timeout = 1;
0610 }
0611
0612 int main(int argc, char *argv[])
0613 {
0614 int c;
0615 int page_size = getpagesize();
0616 time_t now;
0617 int i, dir_error;
0618 pthread_attr_t attr;
0619 key_t shm_key = (key_t) getpid();
0620 int shmid, run_time = 20 * 60;
0621 struct sigaction sa_alrm;
0622
0623 snprintf(logdir, LOGDIR_NAME_SIZE,
0624 "/tmp/logdir-%u", (unsigned int)getpid());
0625 while ((c = getopt(argc, argv, "r:hn:l:t:")) != -1) {
0626 switch(c) {
0627 case 'r':
0628 start_cpu = strtoul(optarg, NULL, 10);
0629 break;
0630 case 'h':
0631 printf("%s [-r <start_cpu>] [-n <nrthreads>] [-l <logdir>] [-t <timeout>]\n", argv[0]);
0632 exit(0);
0633 break;
0634 case 'n':
0635 nrthreads = strtoul(optarg, NULL, 10);
0636 break;
0637 case 'l':
0638 strncpy(logdir, optarg, LOGDIR_NAME_SIZE - 1);
0639 break;
0640 case 't':
0641 run_time = strtoul(optarg, NULL, 10);
0642 break;
0643 default:
0644 printf("invalid option\n");
0645 exit(0);
0646 break;
0647 }
0648 }
0649
0650 if (nrthreads > MAX_THREADS)
0651 nrthreads = MAX_THREADS;
0652
0653 shmid = shmget(shm_key, page_size, IPC_CREAT|0666);
0654 if (shmid < 0) {
0655 err_msg("Failed shmget\n");
0656 }
0657
0658 map1 = shmat(shmid, NULL, 0);
0659 if (map1 == (void *) -1) {
0660 err_msg("Failed shmat");
0661 }
0662
0663 map2 = shmat(shmid, NULL, 0);
0664 if (map2 == (void *) -1) {
0665 err_msg("Failed shmat");
0666 }
0667
0668 dir_error = mkdir(logdir, 0755);
0669
0670 if (dir_error) {
0671 err_msg("Failed mkdir");
0672 }
0673
0674 printf("start_cpu list:%lu\n", start_cpu);
0675 printf("number of worker threads:%lu + 1 snapshot thread\n", nrthreads);
0676 printf("Allocated address:0x%016lx + secondary map:0x%016lx\n", (unsigned long)map1, (unsigned long)map2);
0677 printf("logdir at : %s\n", logdir);
0678 printf("Timeout: %d seconds\n", run_time);
0679
0680 time(&now);
0681 printf("=================================\n");
0682 printf(" Starting Test\n");
0683 printf(" %s", ctime(&now));
0684 printf("=================================\n");
0685
0686 for (i = 0; i < nrthreads; i++) {
0687 if (1 && !fork()) {
0688 prctl(PR_SET_PDEATHSIG, SIGKILL);
0689 set_mycpu(start_cpu + i);
0690 for (;;)
0691 sched_yield();
0692 exit(0);
0693 }
0694 }
0695
0696
0697 sa_alrm.sa_handler = &alrm_sighandler;
0698 sigemptyset(&sa_alrm.sa_mask);
0699 sa_alrm.sa_flags = 0;
0700
0701 if (sigaction(SIGALRM, &sa_alrm, 0) == -1) {
0702 err_msg("Failed signal handler registration\n");
0703 }
0704
0705 alarm(run_time);
0706
0707 pthread_attr_init(&attr);
0708 for (i = 0; i < nrthreads; i++) {
0709 rim_thread_ids[i] = i;
0710 pthread_create(&rim_threads[i], &attr, rim_fn, &rim_thread_ids[i]);
0711 set_pthread_cpu(rim_threads[i], start_cpu + i);
0712 }
0713
0714 pthread_create(&mem_snapshot_thread, &attr, mem_snapshot_fn, map1);
0715 set_pthread_cpu(mem_snapshot_thread, start_cpu + i);
0716
0717
0718 pthread_join(mem_snapshot_thread, NULL);
0719 for (i = 0; i < nrthreads; i++) {
0720 pthread_join(rim_threads[i], NULL);
0721 }
0722
0723 if (!timeout) {
0724 time(&now);
0725 printf("=================================\n");
0726 printf(" Data Corruption Detected\n");
0727 printf(" %s", ctime(&now));
0728 printf(" See logfiles in %s\n", logdir);
0729 printf("=================================\n");
0730 return 1;
0731 }
0732 return 0;
0733 }