Sometimes a program needs a large thread-local data array. Thread stacks may be limited in size, and they may not provide enough flexibility or speed. In this case one can use the
%GS
register of amd64 (x86-64) CPUs as a base register and rely on the GS-relative addressing mode of 64-bit processors.
The following example allows creating up to 32 GB of per-thread data storage and accessing any element with a single instruction.
/*
 * Per-thread storage addressed through the %gs segment register.
 *
 * Each thread allocates its own page-aligned buffer, installs it as the
 * GS base with arch_prctl(ARCH_SET_GS), and then reads/writes elements with
 * a single GS-relative mov instruction.
 *
 * Platform: Linux on x86-64 only (uses <asm/prctl.h> and x86-64 inline asm).
 */
#ifndef _GNU_SOURCE
#define _GNU_SOURCE /* exposes syscall() in <unistd.h> */
#endif

#include <stdio.h>
#include <asm/prctl.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
#include <unistd.h>
#include <sys/syscall.h>

enum {
    NT = 10,               /* number of worker threads */
    PAGES_PER_THREAD = 10, /* size of each thread's buffer, in pages */
    SLOTS = 100            /* number of 8-byte slots exercised by the self-test */
};

/*
 * Install `base` as the calling thread's GS segment base.
 *
 * glibc provides no arch_prctl() wrapper, so the system call is issued
 * through syscall(2).
 *
 * Returns 0 on success; on failure prints a diagnostic with perror() and
 * returns -1.
 */
static int set_base(void *base)
{
    if (syscall(SYS_arch_prctl, ARCH_SET_GS, base) < 0) {
        perror("prctl set");
        return -1;
    }
    return 0;
}

/*
 * Load the 32-bit value stored at byte offset (idx + handle * 8) from the
 * current thread's GS base.
 *
 * The operands are widened to 64 bits so that the effective address is
 * computed with 64-bit registers; with 32-bit register operands the
 * assembler would emit a 32-bit address computation that wraps at 4 GiB.
 * The "memory" clobber tells the compiler that this statement reads memory
 * it cannot see through the operand list.
 */
static inline int get_at_base(int handle, int idx)
{
    int rv;

    __asm__ __volatile__(
        "movl %%gs:(%2,%1,8), %0"
        : "=r"(rv)
        : "r"((long)handle), "r"((long)idx)
        : "memory");
    return rv;
}

/*
 * Store the 32-bit value `val` at byte offset (idx + handle * 8) from the
 * current thread's GS base.
 *
 * Returns `val`, the value that was stored.
 */
static inline int put_at_base(int handle, int idx, int val)
{
    __asm__ __volatile__(
        "movl %2, %%gs:(%1,%0,8)"
        :
        : "r"((long)handle), "r"((long)idx), "r"(val)
        : "memory");
    return val;
}

/*
 * Thread entry point: allocate a page-aligned buffer, make it the GS base,
 * then write SLOTS values and read them back to verify that the storage is
 * private to this thread.
 *
 * `arg` is unused. Always returns NULL.
 */
static void *runner(void *arg)
{
    (void)arg;

    long ps = sysconf(_SC_PAGESIZE);
    if (ps <= 0) {
        perror("sysconf");
        return NULL;
    }

    /* The size is a multiple of the alignment, as aligned_alloc() requires. */
    size_t size = (size_t)ps * PAGES_PER_THREAD;
    void *base = aligned_alloc((size_t)ps, size);
    if (base == NULL) {
        perror("aligned_alloc");
        return NULL;
    }

    /* Without a valid GS base the accesses below would fault, so bail out. */
    if (set_base(base) < 0) {
        free(base);
        return NULL;
    }

    for (int i = 0; i < SLOTS; i++) {
        put_at_base(i, 1, i);
    }
    for (int i = 0; i < SLOTS; i++) {
        int v = get_at_base(i, 1);
        if (v != i) {
            printf("bug %d %d\n", i, v);
        }
    }
    printf("OK: %p\n", base);

    /* Detach the buffer from GS before releasing it. */
    (void)set_base(NULL);
    free(base);
    return NULL;
}

/*
 * Start NT worker threads, wait for every thread that was actually created,
 * and return non-zero if any thread could not be created or joined.
 */
int main(void)
{
    pthread_t threads[NT];
    int started = 0;
    int status = 0;

    for (; started < NT; started++) {
        /* pthread functions return the error number instead of setting errno. */
        int rc = pthread_create(&threads[started], NULL, runner, NULL);
        if (rc != 0) {
            fprintf(stderr, "pthread_create: %s\n", strerror(rc));
            status = 1;
            break;
        }
    }

    for (int i = 0; i < started; i++) {
        int rc = pthread_join(threads[i], NULL);
        if (rc != 0) {
            fprintf(stderr, "pthread_join: %s\n", strerror(rc));
            status = 1;
        }
    }
    return status;
}