/*
 * Copyright 2013-2016 Formal Methods and Tools, University of Twente
* Copyright 2016-2017 Tom van Dijk, Johannes Kepler University Linz
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <errno.h> // for errno
#include <sched.h> // for sched_getaffinity
#include <stdio.h> // for fprintf
#include <stdlib.h> // for memalign, malloc
#include <string.h> // for memset
#include <sys/mman.h> // for mprotect
#include <sys/time.h> // for gettimeofday
#include <pthread.h>
#include <unistd.h>
#include <assert.h>
#include <lace.h>
#if LACE_USE_HWLOC
#include <hwloc.h>
/**
 * HWLOC topology information
 */
static hwloc_topology_t topo;
static unsigned int n_nodes, n_cores, n_pus;
#endif
/**
* (public) Worker data
*/
static Worker **workers = NULL;
/**
* Default sizes for program stack and task deque
*/
static size_t default_stacksize = 0; // 0 means "set by lace_init"
static size_t default_dqsize = 100000;
/**
* Verbosity flag, set with lace_set_verbosity
 */
static int verbosity = 0;
/**
* Number of workers and number of enabled/active workers
*/
static unsigned int n_workers = 0;
static unsigned int enabled_workers = 0;
/**
* Datastructure of the task deque etc for each worker.
* - first public cachelines (accessible via global "workers" variable)
* - then private cachelines
* - then the deque array
*/
typedef struct {
Worker worker_public;
char pad1[PAD(sizeof(Worker), LINE_SIZE)];
WorkerP worker_private;
char pad2[PAD(sizeof(WorkerP), LINE_SIZE)];
Task deque[];
} worker_data;
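/*
 * Note: worker_data ends in a flexible array member, so each per-worker block is
 * allocated as sizeof(worker_data) + dqsize * sizeof(Task) (see workers_memory_size
 * below); the PAD fields keep the public and private parts on separate cache lines.
 */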
/**
* (Secret) holds pointers to the memory block allocated for each worker
*/
static worker_data **workers_memory = NULL;
/**
* Number of bytes allocated for each worker's worker data.
*/
static size_t workers_memory_size = 0;
/**
* (Secret) holds pointer to private Worker data, just for stats collection at end
 */
static WorkerP **workers_p;
/**
* Thread-specific mechanism to access current worker data
*/
#ifdef __linux__ // use gcc thread-local storage (i.e. __thread variables)
static __thread WorkerP *current_worker;
#else
static pthread_key_t worker_key;
#endif
/**
* worker_attr used for creating threads
* - initialized by lace_init
* - used by lace_spawn_worker
 */
static pthread_attr_t worker_attr;
/**
* The condition/mutex pair for when the root thread sleeps until the end of the program
*/
static pthread_cond_t wait_until_done = PTHREAD_COND_INITIALIZER;
static pthread_mutex_t wait_until_done_mutex = PTHREAD_MUTEX_INITIALIZER;
/**
* Data structure that contains the stack and stack size for each worker.
*/
struct lace_worker_init
{
void* stack;
size_t stacksize;
};
static struct lace_worker_init *workers_init;
/**
* Global newframe variable used for the implementation of NEWFRAME and TOGETHER
 */
lace_newframe_t lace_newframe;
/**
* Get the private Worker data of the current thread
*/
WorkerP*
lace_get_worker()
{
#ifdef __linux__
return current_worker;
#else
return (WorkerP*)pthread_getspecific(worker_key);
#endif
}
/**
* Find the head of the task deque, using the given private Worker data
*/
Task*
lace_get_head(WorkerP *self)
{
Task *dq = self->dq;
if (dq[0].thief == 0) return dq;
if (dq[1].thief == 0) return dq+1;
if (dq[2].thief == 0) return dq+2;
/* Then fast search for a low/high bound using powers of 2: 4, 8, 16... */
size_t low = 2;
size_t high = self->end - self->dq;
for (;;) {
if (low*2 >= high) {
break;
} else if (dq[low*2].thief == 0) {
high=low*2;
break;
} else {
low*=2;
}
}
while (low < high) {
size_t mid = low + (high-low)/2;
if (dq[mid].thief == 0) high = mid;
else low = mid + 1;
}
return dq+low;
}
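/*
 * Note on the search above: entries with thief == 0 have never been claimed, so the
 * deque is a prefix of claimed tasks followed by unclaimed ones. After probing the
 * first few slots directly, the index is doubled to bracket the boundary and a
 * binary search inside that bracket finds the head, giving O(log n) lookups.
 */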
/**
* Get the default stack size (or 0 for automatically determine)
*/
size_t
lace_default_stacksize()
{
return default_stacksize;
}
/**
* If we are collecting PIE times, then we need some helper functions.
*/
#if LACE_PIE_TIMES
static uint64_t count_at_start, count_at_end;
static long long unsigned us_elapsed_timer;
static void
us_elapsed_start(void)
{
struct timeval now;
gettimeofday(&now, NULL);
us_elapsed_timer = now.tv_sec * 1000000LL + now.tv_usec;
}
static long long unsigned
us_elapsed(void)
{
struct timeval now;
long long unsigned t;
gettimeofday( &now, NULL );
t = now.tv_sec * 1000000LL + now.tv_usec;
return t - us_elapsed_timer;
}
#endif
/**
* Lace barrier implementation, that synchronizes on all currently enabled workers.
*/
typedef struct {
    volatile int __attribute__((aligned(LINE_SIZE))) count;
    volatile int __attribute__((aligned(LINE_SIZE))) leaving;
    volatile int __attribute__((aligned(LINE_SIZE))) wait;
} barrier_t;
barrier_t lace_bar;
/**
* Enter the Lace barrier and wait until all workers have entered the Lace barrier.
*/
void
lace_barrier()
{
int wait = lace_bar.wait;
if ((int)enabled_workers == __sync_add_and_fetch(&lace_bar.count, 1)) {
lace_bar.count = 0;
lace_bar.leaving = enabled_workers;
lace_bar.wait = 1 - wait; // flip wait
} else {
while (wait == lace_bar.wait) {} // wait
    }
    __sync_add_and_fetch(&lace_bar.leaving, -1);
}
static void
lace_barrier_init()
{
memset(&lace_bar, 0, sizeof(barrier_t));
}
/**
* Destroy the Lace barrier (just wait until all are exited)
*/
static void
lace_barrier_destroy()
{
// wait for all to exit
while (lace_bar.leaving != 0) continue;
}
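/*
 * Design note: lace_barrier is a sense-reversing barrier. Each worker remembers the
 * value of lace_bar.wait on entry and spins until it flips; the last worker to
 * arrive resets the count, sets "leaving", and flips the flag, releasing all
 * spinners at once. The "leaving" counter lets lace_barrier_destroy wait until
 * every worker has actually left the barrier.
 */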
/**
* For debugging purposes, check if memory is allocated on the correct memory nodes.
*/
static void __attribute__((unused))
lace_check_memory(void)
{
#if LACE_USE_HWLOC
// get our current worker
WorkerP *w = lace_get_worker();
void* mem = workers_memory[w->worker];
// get pinned PUs
hwloc_cpuset_t cpuset = hwloc_bitmap_alloc();
hwloc_get_cpubind(topo, cpuset, HWLOC_CPUBIND_THREAD);
// get nodes of pinned PUs
hwloc_nodeset_t cpunodes = hwloc_bitmap_alloc();
hwloc_cpuset_to_nodeset(topo, cpuset, cpunodes);
// get location of memory
hwloc_nodeset_t memlocation = hwloc_bitmap_alloc();
#ifdef hwloc_get_area_memlocation
hwloc_get_area_memlocation(topo, mem, sizeof(worker_data), memlocation, HWLOC_MEMBIND_BYNODESET);
#else
hwloc_membind_policy_t policy;
int res = hwloc_get_area_membind_nodeset(topo, mem, sizeof(worker_data), memlocation, &policy, HWLOC_MEMBIND_STRICT);
if (res == -1) {
fprintf(stderr, "Lace warning: hwloc_get_area_membind_nodeset returned -1!\n");
if (policy != HWLOC_MEMBIND_BIND) {
fprintf(stderr, "Lace warning: Lace worker memory not bound with BIND policy!\n");
}
#endif
// check if CPU and node are on the same place
if (!hwloc_bitmap_isincluded(memlocation, cpunodes)) {
fprintf(stderr, "Lace warning: Lace thread not on same memory domain as data!\n");
char *strp, *strp2, *strp3;
hwloc_bitmap_list_asprintf(&strp, cpuset);
hwloc_bitmap_list_asprintf(&strp2, cpunodes);
hwloc_bitmap_list_asprintf(&strp3, memlocation);
fprintf(stderr, "Worker %d is pinned on PUs %s, node %s; memory is pinned on node %s\n", w->worker, strp, strp2, strp3);
free(strp);
free(strp2);
free(strp3);
}
// free allocated memory
hwloc_bitmap_free(cpuset);
hwloc_bitmap_free(cpunodes);
hwloc_bitmap_free(memlocation);
#endif
}
/**
 * Pin the current worker thread to a core and bind its memory to the local node.
 */
void
lace_pin_worker(void)
{
#if LACE_USE_HWLOC
// Get our worker
unsigned int worker = lace_get_worker()->worker;
// Get our core (hwloc object)
hwloc_obj_t pu = hwloc_get_obj_by_type(topo, HWLOC_OBJ_CORE, worker % n_cores);
// Get our copy of the bitmap
hwloc_cpuset_t bmp = hwloc_bitmap_dup(pu->cpuset);
// Get number of PUs in bitmap
int n = -1, count=0;
while ((n=hwloc_bitmap_next(bmp, n)) != -1) count++;
// Check if we actually have any logical processors
if (count == 0) {
fprintf(stderr, "Lace error: trying to pin a worker on an empty core?\n");
exit(-1);
}
// Select the correct PU on the core (in case of hyperthreading)
int idx = worker / n_cores;
if (idx >= count) {
fprintf(stderr, "Lace warning: more workers than available logical processors!\n");
idx %= count;
}
// Find index of PU and restrict bitmap
n = -1;
for (int i=0; i<=idx; i++) n = hwloc_bitmap_next(bmp, n);
hwloc_bitmap_only(bmp, n);
if (hwloc_set_cpubind(topo, bmp, HWLOC_CPUBIND_THREAD) == -1) {
fprintf(stderr, "Lace warning: hwloc_set_cpubind returned -1!\n");
}
// Free our copy of the bitmap
hwloc_bitmap_free(bmp);
// Pin the memory area (using the appropriate hwloc function)
#ifdef HWLOC_MEMBIND_BYNODESET
int res = hwloc_set_area_membind(topo, workers_memory[worker], workers_memory_size, pu->nodeset, HWLOC_MEMBIND_BIND, HWLOC_MEMBIND_STRICT | HWLOC_MEMBIND_MIGRATE | HWLOC_MEMBIND_BYNODESET);
#else
int res = hwloc_set_area_membind_nodeset(topo, workers_memory[worker], workers_memory_size, pu->nodeset, HWLOC_MEMBIND_BIND, HWLOC_MEMBIND_STRICT | HWLOC_MEMBIND_MIGRATE);
#endif
if (res != 0) {
fprintf(stderr, "Lace error: Unable to bind worker memory to node!\n");
}
// Check if everything is on the correct node
lace_check_memory();
#endif
}
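/*
 * Note on the pinning scheme above: worker i is placed on core (i % n_cores); when
 * there are more workers than cores, the hyperthread index (i / n_cores) selects
 * which PU on that core is used, and the worker's memory block is bound to that
 * core's NUMA node.
 */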
void
lace_init_worker(unsigned int worker)
{
// Allocate our memory
workers_memory[worker] = mmap(NULL, workers_memory_size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
if (workers_memory[worker] == MAP_FAILED) {
fprintf(stderr, "Lace error: Unable to allocate memory for the Lace worker!\n");
exit(1);
}
// Set pointers
Worker *wt = workers[worker] = &workers_memory[worker]->worker_public;
WorkerP *w = workers_p[worker] = &workers_memory[worker]->worker_private;
w->dq = workers_memory[worker]->deque;
#ifdef __linux__
current_worker = w;
#endif
// Initialize public worker data
wt->dq = w->dq;
wt->ts.v = 0;
wt->allstolen = 0;
wt->movesplit = 0;
// Initialize private worker data
w->_public = wt;
w->split = w->dq;
w->allstolen = 0;
    w->worker = worker;
#if LACE_USE_HWLOC
    w->pu = worker % n_cores;
#else
w->pu = -1;
#endif
w->enabled = 1;
if (workers_init[worker].stack != 0) {
w->stack_trigger = ((size_t)workers_init[worker].stack) + workers_init[worker].stacksize/20;
} else {
w->stack_trigger = 0;
}
#if LACE_COUNT_EVENTS
    // Reset counters
    { int k; for (k=0; k<CTR_MAX; k++) w->ctr[k] = 0; }
#endif
// Synchronize with others
lace_barrier();
#if LACE_PIE_TIMES
w->time = gethrtime();
w->level = 0;
#endif
}
/**
* Some OSX systems do not implement pthread_barrier_t, so we provide an implementation here.
*/
#if defined(__APPLE__) && !defined(pthread_barrier_t)
typedef int pthread_barrierattr_t;
typedef struct
{
pthread_mutex_t mutex;
pthread_cond_t cond;
int count;
int tripCount;
} pthread_barrier_t;
static int
pthread_barrier_init(pthread_barrier_t *barrier, const pthread_barrierattr_t *attr, unsigned int count)
{
if(count == 0)
{
errno = EINVAL;
return -1;
}
if(pthread_mutex_init(&barrier->mutex, 0) < 0)
{
return -1;
}
if(pthread_cond_init(&barrier->cond, 0) < 0)
{
pthread_mutex_destroy(&barrier->mutex);
return -1;
}
barrier->tripCount = count;
barrier->count = 0;
    (void)attr;
    return 0;
}
static int
pthread_barrier_destroy(pthread_barrier_t *barrier)
{
pthread_cond_destroy(&barrier->cond);
pthread_mutex_destroy(&barrier->mutex);
return 0;
}
static int
pthread_barrier_wait(pthread_barrier_t *barrier)
{
pthread_mutex_lock(&barrier->mutex);
++(barrier->count);
if(barrier->count >= barrier->tripCount)
{
barrier->count = 0;
pthread_cond_broadcast(&barrier->cond);
pthread_mutex_unlock(&barrier->mutex);
return 1;
}
else
{
pthread_cond_wait(&barrier->cond, &(barrier->mutex));
pthread_mutex_unlock(&barrier->mutex);
return 0;
}
}
#endif // defined(__APPLE__) && !defined(pthread_barrier_t)
static pthread_barrier_t suspend_barrier;
static volatile int must_suspend = 0, suspended = 0;
void
lace_suspend()
{
if (suspended == 0) {
suspended = 1;
must_suspend = 1;
lace_barrier();
must_suspend = 0;
}
}
void
lace_resume()
{
if (suspended == 1) {
suspended = 0;
pthread_barrier_wait(&suspend_barrier);
}
}
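/*
 * Usage sketch (hypothetical caller code, not part of this file): the suspend
 * mechanism parks the work-stealing loop around a phase in which no Lace tasks
 * should run:
 *
 *     lace_suspend();        // workers stop stealing and block on suspend_barrier
 *     critical_section();    // hypothetical application function
 *     lace_resume();         // workers are released and resume stealing
 */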
/**
* Disable worker <worker>.
 * If the given worker is the current worker, this function does nothing.
 */
void
lace_disable_worker(unsigned int worker)
{
    unsigned int self = lace_get_worker()->worker;
    if (worker == self) return;
if (workers_p[worker]->enabled == 1) {
workers_p[worker]->enabled = 0;
enabled_workers--;
}
}
/**
* Enable worker <worker>.
* If the given worker is the current worker, this function does nothing.
*/
void
lace_enable_worker(unsigned int worker)
{
    unsigned int self = lace_get_worker()->worker;
    if (worker == self) return;
if (workers_p[worker]->enabled == 0) {
workers_p[worker]->enabled = 1;
enabled_workers++;
}
}
/**
* Enables all workers 0..(N-1) and disables workers N..max.
* This function _should_ be called by worker 0.
* Ignores the current worker if >= N.
 * The number of workers is never reduced below 1.
 */
void
lace_set_workers(unsigned int workercount)
{
    if (workercount < 1) workercount = 1;
    if (workercount > n_workers) workercount = n_workers;
    enabled_workers = workercount;
    unsigned int self = lace_get_worker()->worker;
    for (unsigned int i=0; i<n_workers; i++) {
        workers_p[i]->enabled = (i < workercount || i == self) ? 1 : 0;
}
}
/**
* Get the number of currently enabled workers.
*/
unsigned int
lace_enabled_workers()
{
return enabled_workers;
}
/**
 * Simple random number generator (like rand) using the given seed.
 * (Used for thread-specific (scalable) random number generation.)
*/
static inline uint32_t
rng(uint32_t *seed, int max)
{
uint32_t next = *seed;
next *= 1103515245;
next += 12345;
*seed = next;
return next % max;
}
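/*
 * Note: this is a plain linear congruential generator; the multiplier 1103515245
 * and increment 12345 are the classic ANSI C rand() constants. It only needs to
 * spread steal attempts across victims, not be a high-quality RNG.
 * Example: rng(&seed, n_workers-1) yields a value in [0, n_workers-2].
 */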
/**
* (Try to) steal and execute a task from a random worker.
*/
VOID_TASK_0(lace_steal_random)
{
Worker *victim = workers[(__lace_worker->worker + 1 + rng(&__lace_worker->seed, n_workers-1)) % n_workers];
YIELD_NEWFRAME();
PR_COUNTSTEALS(__lace_worker, CTR_steal_tries);
Worker *res = lace_steal(__lace_worker, __lace_dq_head, victim);
if (res == LACE_STOLEN) {
PR_COUNTSTEALS(__lace_worker, CTR_steals);
} else if (res == LACE_BUSY) {
PR_COUNTSTEALS(__lace_worker, CTR_steal_busy);
}
}
static lace_startup_cb main_cb;   // root task callback, set by lace_startup
static int lace_quits = 0;        // set when Lace is shutting down
static void*
lace_main_wrapper(void *arg)
{
    lace_init_worker(0);
    lace_pin_worker();
    LACE_ME;
    WRAP(main_cb, arg);
    // Now signal that we're done
    pthread_mutex_lock(&wait_until_done_mutex);
    pthread_cond_broadcast(&wait_until_done);
    pthread_mutex_unlock(&wait_until_done_mutex);
    return NULL;
}
/**
* Main Lace worker implementation.
* Steal from random victims until "quit" is set.
*/
VOID_TASK_1(lace_steal_loop, int*, quit)
{
// Determine who I am
const int worker_id = __lace_worker->worker;
// Prepare self, victim
Worker ** const self = &workers[worker_id];
Worker **victim = self;
#if LACE_PIE_TIMES
__lace_worker->time = gethrtime();
#endif
uint32_t seed = worker_id;
unsigned int n = n_workers;
int i=0;
while(*(volatile int*)quit == 0) {
// Select victim
if( i>0 ) {
i--;
victim++;
if (victim == self) victim++;
if (victim >= workers + n) victim = workers;
if (victim == self) victim++;
} else {
i = rng(&seed, 40); // compute random i 0..40
victim = workers + (rng(&seed, n-1) + worker_id + 1) % n;
}
PR_COUNTSTEALS(__lace_worker, CTR_steal_tries);
Worker *res = lace_steal(__lace_worker, __lace_dq_head, *victim);
if (res == LACE_STOLEN) {
PR_COUNTSTEALS(__lace_worker, CTR_steals);
} else if (res == LACE_BUSY) {
PR_COUNTSTEALS(__lace_worker, CTR_steal_busy);
}
YIELD_NEWFRAME();
if (must_suspend) {
lace_barrier();
do {
pthread_barrier_wait(&suspend_barrier);
} while (__lace_worker->enabled == 0);
}
}
}
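/*
 * Note on victim selection above: a random victim is drawn, then the scan walks the
 * workers array sequentially (skipping the worker itself and wrapping around) for a
 * random number of further attempts (up to 40) before a fresh random victim is drawn.
 */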
/**
* Initialize worker 0.
* Calls lace_init_worker and then signals the event.
*/
void
lace_init_main()
{
lace_init_worker(0);
}
/**
* Initialize the current thread as a Lace thread, and perform work-stealing
* as worker <worker> until lace_exit() is called.
*
* For worker 0, use lace_init_main
*/
void
lace_run_worker(void)
{
    // Run the steal loop
LACE_ME;
CALL(lace_steal_loop, &lace_quits);
// Time worker exit event
}
static void*
lace_default_worker_thread(void* arg)
{
int worker = (int)(size_t)arg;
lace_init_worker(worker);
lace_pin_worker();
lace_run_worker();
return NULL;
}
pthread_t
lace_spawn_worker(int worker, size_t stacksize, void* (*fun)(void*), void* arg)
{
// Determine stack size
if (stacksize == 0) stacksize = default_stacksize;
size_t pagesize = sysconf(_SC_PAGESIZE);
stacksize = (stacksize + pagesize - 1) & ~(pagesize - 1); // ceil(stacksize, pagesize)
#if LACE_USE_HWLOC
    // Get our logical processor
hwloc_obj_t pu = hwloc_get_obj_by_type(topo, HWLOC_OBJ_PU, worker % n_pus);
// Allocate memory for the program stack
void *stack_location = hwloc_alloc_membind(topo, stacksize + pagesize, pu->cpuset, HWLOC_MEMBIND_BIND, 0);
if (stack_location == 0) {
fprintf(stderr, "Lace error: Unable to allocate memory for the pthread stack!\n");
exit(1);
}
#else
    void *stack_location = mmap(NULL, stacksize + pagesize, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
    if (stack_location == MAP_FAILED) {
        fprintf(stderr, "Lace error: Unable to allocate memory for the pthread stack!\n");
        exit(1);
    }
#endif
if (0 != mprotect(stack_location, pagesize, PROT_NONE)) {
fprintf(stderr, "Lace error: Unable to protect the allocated program stack with a guard page!\n");
exit(1);
}
stack_location = (uint8_t *)stack_location + pagesize; // skip protected page.
if (0 != pthread_attr_setstack(&worker_attr, stack_location, stacksize)) {
fprintf(stderr, "Lace error: Unable to set the pthread stack in Lace!\n");
exit(1);
}
workers_init[worker].stack = stack_location;
workers_init[worker].stacksize = stacksize;
    if (fun == 0) {
        fun = lace_default_worker_thread;
        arg = (void*)(size_t)worker;
    }
pthread_t res;
pthread_create(&res, &worker_attr, fun, arg);
return res;
}
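/*
 * Note on the stack setup above: one extra page is allocated below the stack and
 * mprotect'ed with PROT_NONE, so a stack overflow faults on the guard page instead
 * of silently corrupting adjacent memory; the usable stack starts after that page.
 */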
void
lace_set_verbosity(int level)
{
verbosity = level;
}
/**
* Initialize Lace for work-stealing with <n> workers, where
* each worker gets a task deque with <dqsize> elements.
*/
void
lace_init(unsigned int _n_workers, size_t dqsize)
{
#if LACE_USE_HWLOC
// Initialize topology and information about cpus
hwloc_topology_init(&topo);
hwloc_topology_load(topo);
n_nodes = hwloc_get_nbobjs_by_type(topo, HWLOC_OBJ_NODE);
n_cores = hwloc_get_nbobjs_by_type(topo, HWLOC_OBJ_CORE);
n_pus = hwloc_get_nbobjs_by_type(topo, HWLOC_OBJ_PU);
#elif defined(sched_getaffinity)
cpu_set_t cs;
CPU_ZERO(&cs);
sched_getaffinity(0, sizeof(cs), &cs);
unsigned int n_pus = CPU_COUNT(&cs);
#else
    unsigned int n_pus = sysconf(_SC_NPROCESSORS_ONLN);
#endif
    // Initialize globals
    n_workers = _n_workers == 0 ? n_pus : _n_workers;
    enabled_workers = n_workers;
    if (dqsize != 0) default_dqsize = dqsize;
    else dqsize = default_dqsize;
lace_barrier_init();
// Create suspend barrier
pthread_barrier_init(&suspend_barrier, NULL, n_workers);
// Allocate array with all workers
if (posix_memalign((void**)&workers, LINE_SIZE, n_workers*sizeof(Worker*)) != 0 ||
posix_memalign((void**)&workers_p, LINE_SIZE, n_workers*sizeof(WorkerP*)) != 0 ||
posix_memalign((void**)&workers_memory, LINE_SIZE, n_workers*sizeof(worker_data*)) != 0) {
fprintf(stderr, "Lace error: unable to allocate memory!\n");
exit(1);
}
// Compute memory size for each worker
workers_memory_size = sizeof(worker_data) + sizeof(Task) * dqsize;
// Create pthread key
#ifndef __linux__
pthread_key_create(&worker_key, NULL);
#endif
// Prepare structures for thread creation
pthread_attr_init(&worker_attr);
// Set contention scope to system (instead of process)
pthread_attr_setscope(&worker_attr, PTHREAD_SCOPE_SYSTEM);
// Get default stack size
if (pthread_attr_getstacksize(&worker_attr, &default_stacksize) != 0) {
fprintf(stderr, "Lace warning: pthread_attr_getstacksize returned error!\n");
default_stacksize = 1048576; // 1 megabyte default
}
if (verbosity) {
fprintf(stderr, "Initializing Lace, %u nodes, %u cores, %u logical processors, %d workers.\n", n_nodes, n_cores, n_pus, n_workers);
#else
fprintf(stderr, "Initializing Lace, %u available cores, %d workers.\n", n_pus, n_workers);
#endif
}
// Prepare lace_init structure
workers_init = (struct lace_worker_init*)calloc(1, sizeof(struct lace_worker_init) * n_workers);
lace_newframe.t = NULL;
#if LACE_PIE_TIMES
// Initialize counters for pie times
us_elapsed_start();
count_at_start = gethrtime();
#endif
}
/**
* Start the worker threads.
* If cb is set, then the current thread is suspended and Worker 0 is a new thread that starts with
* the given cb(arg) as the root task.
* If cb is not set, then the current thread is Worker 0 and this function returns.
*/
void
lace_startup(size_t stacksize, lace_startup_cb cb, void *arg)
{
if (stacksize == 0) stacksize = default_stacksize;
if (verbosity) {
if (cb != 0) {
fprintf(stderr, "Lace startup, creating %d worker threads with program stack %zu bytes.\n", n_workers, stacksize);
} else if (n_workers == 1) {
fprintf(stderr, "Lace startup, creating 0 worker threads.\n");
} else {
fprintf(stderr, "Lace startup, creating %d worker threads with program stack %zu bytes.\n", n_workers-1, stacksize);
}
}
/* Spawn all other workers */
for (unsigned int i=1; i<n_workers; i++) lace_spawn_worker(i, stacksize, 0, 0);
    if (cb != 0) {
        /* If cb set, spawn worker 0 and wait here until the root task signals completion */
        main_cb = cb;
        lace_spawn_worker(0, stacksize, lace_main_wrapper, arg);
        pthread_mutex_lock(&wait_until_done_mutex);
        if (lace_quits == 0) pthread_cond_wait(&wait_until_done, &wait_until_done_mutex);
        pthread_mutex_unlock(&wait_until_done_mutex);
    } else {
        /* If cb not set, use current thread as worker 0 */
        lace_init_worker(0);
    }
}
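/*
 * Usage sketch (hypothetical application code, not part of this file), assuming a
 * Lace task "fib" declared elsewhere with TASK_1 and that lace_exit() is provided
 * by lace.h:
 *
 *     lace_init(0, 0);             // 0 workers = autodetect, 0 = default deque size
 *     lace_startup(0, NULL, NULL); // no callback: this thread becomes worker 0
 *     LACE_ME;
 *     uint64_t r = CALL(fib, 40);
 *     lace_exit();
 */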
#if LACE_COUNT_EVENTS
static uint64_t ctr_all[CTR_MAX];
#endif
void
lace_count_reset()
{
#if LACE_COUNT_EVENTS
int i;
size_t j;
for (i=0;i<n_workers;i++) {
for (j=0;j<CTR_MAX;j++) {
workers_p[i]->ctr[j] = 0;
}
}
#if LACE_PIE_TIMES
for (i=0;i<n_workers;i++) {
workers_p[i]->time = gethrtime();
if (i != 0) workers_p[i]->level = 0;
}
us_elapsed_start();
count_at_start = gethrtime();
#endif
#endif
}
void
lace_count_report_file(FILE *file)
{
#if LACE_COUNT_EVENTS
int i;
size_t j;
for (j=0;j<CTR_MAX;j++) ctr_all[j] = 0;
for (i=0;i<n_workers;i++) {
uint64_t *wctr = workers_p[i]->ctr;
for (j=0;j<CTR_MAX;j++) {
ctr_all[j] += wctr[j];
}
}
#if LACE_COUNT_TASKS
for (i=0;i<n_workers;i++) {
fprintf(file, "Tasks (%d): %zu\n", i, workers_p[i]->ctr[CTR_tasks]);
}
fprintf(file, "Tasks (sum): %zu\n", ctr_all[CTR_tasks]);
fprintf(file, "\n");
#endif
#if LACE_COUNT_STEALS
for (i=0;i<n_workers;i++) {
fprintf(file, "Steals (%d): %zu good/%zu busy of %zu tries; leaps: %zu good/%zu busy of %zu tries\n", i,