// test to prove an optimization in function get_singleton has bug
// due to the cache-unconscious programming
// Ekin for Vladimir Chebotarev
// Moscow, 2012
// no rights reserved
// run: g++ test.cpp -lpthread && ./a.out 16 10 0.005

// cpu_set_t, CPU_ZERO/CPU_SET and pthread_setaffinity_np are GNU extensions.
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif

#include <stdlib.h>
#include <pthread.h>
#include <sched.h>
#include <time.h>
#include <stdio.h>
#include <math.h>
#include <string.h>

#define THREAD_MAX 1024 // thread_count must be strictly below this
#define CPU_NUM 16      // worker #i is pinned to CPU (i % CPU_NUM)

// Reports a mismatch between the thread's cached pointer and A, then exits
// with the current line number as the status.
// Requires a `tls* t` to be in scope at the point of use.
#define OOPS(A) do { \
        fprintf(stderr, "oops: local = %p;" #A " = %p; thread #%d\n" \
            , t->local, A, t->num); \
        exit(__LINE__); } while(0)

// Per-thread record handed to run().
struct tls
{
    void* local; // thread's view of singleton
    int num;     // index of the thread, used in diagnostics
    int iter_count; // iterations this thread still has to perform
    pthread_t thread;
    cpu_set_t cpu_set; // affinity mask applied to the thread by main()
    timespec pause_time_requested; // only used by the disabled nanosleep in run()
    timespec pause_time_remaining;
    double busy; // accumulator for the busy-work loop in run()
};

// A tls padded to 0x1000 bytes so that consecutive records are 0x1000 bytes
// apart in the tlses array.
struct tls_elem
{
    tls data;
    char pad[0x1000 - sizeof(tls)];
};

tls_elem* tlses = 0;
pthread_mutex_t mutex;
pthread_barrier_t barrier;

static void* volatile singleton = 0; // master copy of singleton

// CODE UNDER TEST -- intentionally left as written.
// Returns the singleton, allocating it on first use. The first check of
// `singleton` is made without holding the mutex; only the allocation path
// takes the lock and re-checks. Whether this unlocked fast path is safe is
// exactly what the program tries to disprove.
void* get_singleton()
{
    if (!singleton)
    {
        pthread_mutex_lock(&mutex);
        if (!singleton)
            singleton = malloc(sizeof(int));
        pthread_mutex_unlock(&mutex);
    }
    return singleton;
}

// Prints what failed together with the error text and terminates when a
// pthread call (which returns an error number instead of setting errno)
// did not succeed.
static void die_on_error(int err, const char* what)
{
    if (err)
    {
        fprintf(stderr, "%s failed: %s\n", what, strerror(err));
        exit(EXIT_FAILURE);
    }
}

// Worker thread body. obj is the thread's tls record; it is returned as is.
// Each iteration: do some busy work, meet everybody at the barrier, fetch
// the singleton, compare it with the value cached in t->local, and meet at
// the barrier again so that main() can inspect and reset the state.
void* run(void* obj)
{
    tls* t = static_cast<tls*>(obj);
    while (t->iter_count--)
    {
        //fprintf(stderr, "thread #%4d started iter = #%d\n", t->num, t->iter_count);
        //nanosleep(&t->pause_time_requested, &t->pause_time_remaining);
        for (int i = 0; i < 10000; ++i)
            t->busy += sin((double)i);

        pthread_barrier_wait(&barrier);

        void* gotten = get_singleton();
        if (!t->local)
            t->local = gotten;
        else if (t->local != gotten)
            OOPS(gotten);

        //fprintf(stderr, "thread #%4d finished iter = #%d\n", t->num, t->iter_count);
        pthread_barrier_wait(&barrier);
    }
    return obj;
}

// Usage: <program> thread_count iter_count pause_time
//   thread_count - number of worker threads, 0 < thread_count < THREAD_MAX
//   iter_count   - number of iterations, > 0
//   pause_time   - seconds (may be fractional, >= 0) the main thread sleeps
//                  after each iteration
// Exits with a non-zero status on bad arguments, on a resource failure, or
// when a thread observes a singleton different from the master copy.
int main (int argc, char* argv[])
{
    if (argc != 4)
    {
        fprintf(stderr, "USAGE: %s thread_count iter_count pause_time\n"
            , argv[0]);
        exit(__LINE__);
    }

    int thread_count = atoi(argv[1]);
    int iter_count = atoi(argv[2]);
    double pause_time = atof(argv[3]);

    // Negative counts would yield a huge allocation size / a practically
    // endless loop; `!(x >= 0.)` also rejects NaN.
    if (!(thread_count > 0 && iter_count > 0 && thread_count < THREAD_MAX)
        || !(pause_time >= 0.))
    {
        fprintf(stderr, "INVALID ARGUMENT\n");
        exit(__LINE__);
    }

    time_t pause_sec = static_cast<time_t>(pause_time);
    long pause_nanosec = static_cast<long>(1e9 * (pause_time - pause_sec));
    if (pause_nanosec > 999999999L) // the product may round up to 1e9
        pause_nanosec = 999999999L;

    die_on_error(pthread_mutex_init(&mutex, 0), "pthread_mutex_init");
    // "+1" because the main thread will wait too
    die_on_error(pthread_barrier_init(&barrier, 0, thread_count + 1)
        , "pthread_barrier_init");

    tlses = static_cast<tls_elem*>(
        malloc(static_cast<size_t>(thread_count) * sizeof(tls_elem)));
    if (!tlses)
    {
        fprintf(stderr, "OUT OF MEMORY\n");
        exit(__LINE__);
    }

    fprintf(stderr, "TEST STARTED\n");

    int i;
    for (i = 0; i < thread_count; ++i)
    {
        tls* t = &tlses[i].data;
        t->local = 0;
        t->num = i;
        CPU_ZERO(&t->cpu_set);
        CPU_SET(i % CPU_NUM, &t->cpu_set);
        t->iter_count = iter_count;
        t->pause_time_requested.tv_sec = pause_sec;
        t->pause_time_requested.tv_nsec = pause_nanosec;
        t->pause_time_remaining.tv_sec = 0;
        t->pause_time_remaining.tv_nsec = 0;
        t->busy = 0.;

        // A missing worker would leave everybody stuck at the barrier,
        // so a failed creation is fatal.
        die_on_error(pthread_create(&t->thread, 0, run, t), "pthread_create");

        // Best effort: the call fails when the machine has no CPU with the
        // requested index, in which case the thread simply stays unpinned.
        (void)pthread_setaffinity_np(t->thread, sizeof(t->cpu_set), &t->cpu_set);
    }

    // The main thread pauses with its own timespec instead of borrowing the
    // fields of thread #0's record.
    timespec pause_requested;
    pause_requested.tv_sec = pause_sec;
    pause_requested.tv_nsec = pause_nanosec;

    while (iter_count--)
    {
        fprintf(stderr, "iter #%d started\n", iter_count);
        pthread_barrier_wait(&barrier); // workers start calling get_singleton()
        pthread_barrier_wait(&barrier); // workers have stored their view
        //fprintf(stderr, "iter #%d finished\n", iter_count);

        // Workers do not touch `local` or `singleton` again until the next
        // barrier, which cannot open before the main thread arrives there.
        for (i = 0; i < thread_count; ++i)
        {
            tls* t = &tlses[i].data;
            if (t->local != singleton || !t->local)
                OOPS(singleton);
            t->local = 0;
        }

        // Drop the singleton so the next iteration has to create it again.
        free(singleton);
        singleton = 0;

        //fprintf(stderr, "  main thread pause iter #%d\n", iter_count);
        nanosleep(&pause_requested, 0);
    }

    fprintf(stderr, "SUCCESS (no bug)\n");

    for (i = 0; i < thread_count; ++i)
        pthread_join(tlses[i].data.thread, 0);

    pthread_barrier_destroy(&barrier);
    pthread_mutex_destroy(&mutex);
    free(tlses);
    tlses = 0;
    return 0;
}