TLS variable in Linux

TLS就是thread local storage，即线程局部存储，其含义很直白：每个线程都能访问该变量，但实际访问的却是不同的地址（有点类似于内核里面的PERCPU variable）。Glibc中的errno就是一个TLS变量，gcc也提供了__thread关键字扩展，可以将一个全局变量声明为TLS。

写一段很简单的程序来验证下：

/* Thread-local counter, declared with GCC's __thread keyword extension. */
__thread int test;
/* Ordinary global counter, kept for comparison with the TLS variable. */
int test2;

/*
 * Increments each variable once so that the code generated for a TLS
 * access and for a plain global access can be compared side by side.
 * argc and argv are unused.
 */
int main(int argc, char **argv) {
    test++;     /* access to the TLS variable */
    test2++;    /* access to the ordinary global */
    return 0;
}

反汇编后如下：

0x00000000004004ec <+0>:     push   %rbp
0x00000000004004ed <+1>:     mov    %rsp,%rbp
0x00000000004004f0 <+4>:     mov    %edi,-0x4(%rbp)
0x00000000004004f3 <+7>:     mov    %rsi,-0x10(%rbp)
0x00000000004004f7 <+11>:    mov    %fs:0xfffffffffffffffc,%eax
0x00000000004004ff <+19>:    add    $0x1,%eax
0x0000000000400502 <+22>:    mov    %eax,%fs:0xfffffffffffffffc
0x000000000040050a <+30>:    mov    0x2003c4(%rip),%eax        # 0x6008d4 <test2>
0x0000000000400510 <+36>:    add    $0x1,%eax
0x0000000000400513 <+39>:    mov    %eax,0x2003bb(%rip)        # 0x6008d4 <test2>
0x0000000000400519 <+45>:    mov    $0x0,%eax
0x000000000040051e <+50>:    pop    %rbp
0x000000000040051f <+51>:    retq

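可以看到，一个普通的全局变量是通过%rip加上一个偏移来访问的，而一个TLS变量则是通过%fs寄存器去访问。而且，这里有一处非常奇怪的地方，%fs寄存器的偏移竟然是负数？虽然x86_64当中的segment是不做limit checking的，但还是想不通这么做是什么缘由。难道这个用作TLS的segment是expand-down的？（其实这与expand-down无关：x86_64的TLS ABI采用variant II布局，%fs的基地址指向线程控制块TCB，而可执行文件的静态TLS块紧挨着放在它的下方，所以TLS变量相对于%fs的偏移是负数。）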

......
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f44eb90e000
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f44eb90d000
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f44eb90c000
arch_prctl(ARCH_SET_FS, 0x7f44eb90d700) = 0
......

用strace观察该程序的行为，发现glibc runtime在初始化的时候调用了arch_prctl，这是一个系统调用，并且传入了参数ARCH_SET_FS，用来设置segment base，主线程的TLS段就是在这里设置。来看看arch_prctl都干了哪些事情。

/*
 * Apply an arch_prctl request to @task.
 *
 * Only ARCH_SET_FS is handled in this excerpt; any other @code leaves the
 * switch without doing anything and the function returns 0.
 *
 * @task: task whose FS base is being changed
 * @code: request code
 * @addr: new FS base address
 *
 * Returns -EPERM if @addr is not below TASK_SIZE_OF(task), otherwise 0 or
 * the value returned by wrmsrl_safe() when the MSR is written.
 */
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
	int ret = 0;
	/* Hardware state is only touched when @task is the running task. */
	int doit = task == current;
	int cpu;

	switch (code) {
	case ARCH_SET_FS:
		/* Not strictly needed for fs, but do it for symmetry
		   with gs */
		/* Rejected before get_cpu(), so no put_cpu() is needed here. */
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		/* Paired with put_cpu() below; cpu is passed to load_TLS(). */
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			/* Base fits in 32 bits: use the FS_TLS entry and selector. */
			set_32bit_tls(task, FS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				loadsegment(fs, FS_TLS_SEL);
			}
			/* Record selector-based FS: no separate 64-bit base is kept. */
			task->thread.fsindex = FS_TLS_SEL;
			task->thread.fs = 0;
		} else {
			/* Base needs more than 32 bits: remember it in thread.fs. */
			task->thread.fsindex = 0;
			task->thread.fs = addr;
			if (doit) {
				/* set the selector to 0 to not confuse
				   __switch_to */
				loadsegment(fs, 0);
				/* Write the base MSR; its result becomes the return value. */
				ret = wrmsrl_safe(MSR_FS_BASE, addr);
			}
		}
		put_cpu();
		break;
	}
	return ret;
}

/*
 * arch_prctl entry point: forwards @code and @addr to do_arch_prctl(),
 * always operating on the calling task (current), and returns its result.
 */
long sys_arch_prctl(int code, unsigned long addr)
{
	return do_arch_prctl(current, code, addr);
}

先检查addr有没有越界，然后关抢占。由于glibc传入的TLS基地址（例如上面的0x7f44eb90d700）大于0xffffffff，所以会执行判断语句的else分支：先将%fs的信息写入thread_struct，再把%fs的选择子清零，最后把基地址写入MSR_FS_BASE。

如果新创建了一个线程，会发生什么情况呢？继续strace，发现如下内容。

mmap(NULL, 8392704, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_STACK, -1, 0) = 0x7f12b0acb000
brk(0)                                  = 0x1e2e000
brk(0x1e4f000)                          = 0x1e4f000
mprotect(0x7f12b0acb000, 4096, PROT_NONE) = 0
clone(Process 6467 attached (waiting for parent)
Process 6467 resumed (parent 6466 ready)
child_stack=0x7f12b12caff0, flags=CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|CLONE_SYSVSEM|CLONE_SETTLS|CLONE_PARENT_SETTID|CLONE_CHILD_CLEARTID, parent_tidptr=0x7f12b12cb9d0, tls=0x7f12b12cb700, child_tidptr=0x7f12b12cb9d0) = 6467

父线程在做clone的时候，会把子线程的TLS段相关参数给传进去，随后do_fork -> copy_process -> copy_thread，最后有如下代码：

/*
 * Set a new TLS for the child thread?
 * Only done when CLONE_SETTLS is set in clone_flags; on failure the
 * error code is left in err and control jumps to the out label.
 */
if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
    /* TIF_IA32 set: the user_desc pointer is taken from childregs->si. */
    if (test_thread_flag(TIF_IA32))
        err = do_set_thread_area(p, -1, (struct user_desc __user *)childregs->si, 0);
    else
#endif
    /*
     * Body of the else above when CONFIG_IA32_EMULATION is defined,
     * unconditional otherwise: the new FS base comes from childregs->r8.
     */
    err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
    if (err)
        goto out;
}

可以看到最后还是会调到do_arch_prctl，并且TLS段的基地址是放在%r8寄存器中的。最后再来看看线程切换的时候会发生什么。

/*
 * Excerpt of __switch_to(): the part that switches the FS selector and
 * base from the outgoing task (prev_p) to the incoming task (next_p).
 * Elided code is marked with "......"; prev, next, cpu, fsindex and
 * gsindex are declared there (presumably prev/next are the thread
 * structures of prev_p/next_p - confirm against the full source).
 */
__notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p) {
    ......
    /* Capture the selectors currently loaded in %fs and %gs. */
    savesegment(fs, fsindex);
    savesegment(gs, gsindex);
    
    /* Load the incoming thread's TLS entries for this cpu. */
    load_TLS(next, cpu);
    
    /*
     * Leave lazy mode, flushing any hypercalls made here.
     * This must be done before restoring TLS segments so
     * the GDT and LDT are properly updated, and must be
     * done before math_state_restore, so the TS bit is up
     * to date.
     */
    arch_end_context_switch(next_p);
    
    /*
     * Switch FS and GS.
     *
     * Segment register != 0 always requires a reload.  Also
     * reload when it has changed.  When prev process used 64bit
     * base always reload to avoid an information leak.
     */
    if (unlikely(fsindex | next->fsindex | prev->fs)) {
        loadsegment(fs, next->fsindex);
        /*
         * Check if the user used a selector != 0; if yes
         * clear 64bit base, since overloaded base is always
         * mapped to the Null selector
         */
        if (fsindex)
            prev->fs = 0;
    }
    /* when next process has a 64bit base use it */
    if (next->fs)
        wrmsrl(MSR_FS_BASE, next->fs);
    /* Remember the selector saved above for the outgoing thread. */
    prev->fsindex = fsindex;
    ......
}

可以看到，线程切换时在此处做了TLS段的切换，调用关系如下：schedule -> __schedule -> context_switch -> switch_to -> __switch_to。

此外，TLS变量还可以动态创建和使用，pthread库提供了pthread_key_create，pthread_key_delete，pthread_setspecific和pthread_getspecific等函数，此处就不再赘述了。

总结一句话，TLS变量的使用需要用户程序，编译器，C runtime和OS多方的配合才能实现。

blog comments powered by Disqus

Published

07 December 2013

TLS variable in Linux

Published

Tags