Pthread mutex与GCC atomic性能测试

问题背景

之前对这一块只有一个初步的理解，阅读过一些文章，大概知道pthread中mutex一次加锁时间在10~100ns之间，直觉上认为这个效率非常高了，最近做MySQL高并发优化工作后，发现自己知道的太少，以致浪费了不少时间，遂决定通过测试总结一下最近学习到的东西

mutex的主要作用是保护临界区，保证其中的操作对其他线程而言是原子的。pthread实现提供的一套api非常简洁，对于一般的应用肯定是够了，但是对于Database这种对性能有变态要求的系统，尤其是在高并发下，需要进一步优化，这时需要用到gcc提供的__sync_xxx_and_fetch系列函数了，具体参考：GCC Atomic Memory Access and built-in functions。这系列函数由编译器直接生成原子的读-改-写指令（同时隐含完整的内存屏障），以保证对内存单元的原子操作，性能较pthread好。本文通过一系列测试得出定量结论。

测试场景

四个基本场景，通过不同线程数得到多组测试数据
场景一：pthread_mutex_lock/pthread_mutex_unlock之间不执行任何代码
场景二：pthread_mutex_lock/pthread_mutex_unlock之间执行一次sum++操作
场景三：pthread_mutex_lock/pthread_mutex_unlock之间执行多句代码
场景四：使用__sync_add_and_fetch原子执行sum++操作

硬件环境

Intel(R) Xeon(R) CPU E5420 @ 2.50GHz

Architecture:          x86_64
CPU op-mode(s):        32-bit, 64-bit
Byte Order:            Little Endian
CPU(s):                8
On-line CPU(s) list:   0-7
Thread(s) per core:    1
Core(s) per socket:    4
CPU socket(s):         2
NUMA node(s):          1
Vendor ID:             GenuineIntel
CPU family:            6
Model:                 23
Stepping:              10
CPU MHz:               2493.847
BogoMIPS:              4987.52
Virtualization:        VT-x
L1d cache:             32K
L1i cache:             32K
L2 cache:              6144K
NUMA node0 CPU(s):     0-7

测试代码

#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#include <time.h>
 
/* Upper bound on worker threads: sizes the tids array in test_func and is
 * the largest thread count main accepts. */
#define MAX_THREADS 10000
 
/* Shared counter updated by the worker functions; test_func resets it to 0
 * before each scenario and prints it afterwards. */
long sum = 0;
/* NOTE(review): sum1 is not referenced anywhere in the visible code. */
long sum1 = 0;
 
/* Start barrier: test_func initialises it for (threads + 1) participants so
 * the workers and the timing thread are released together. */
pthread_barrier_t barr;
/* Mutex locked and unlocked by thread_add_func0/1/2 in the mutex scenarios. */
pthread_mutex_t   lock = PTHREAD_MUTEX_INITIALIZER;
 
/*
 * Scenario 1 worker: lock and immediately unlock the shared mutex with an
 * empty critical section.
 *
 * param: pointer to an int holding the number of iterations to run.
 * Returns NULL; the joiner does not use the thread's return value.
 */
void *thread_add_func0(void * param)
{
  /* Block until all workers and the timing thread reach the barrier. */
  pthread_barrier_wait(&barr);

  /* The count is not modified while the workers run, so read it once. */
  const int loops = *((int*)param);
  int i;
  for (i = 0; i < loops; i++)
  {
    pthread_mutex_lock(&lock);
    pthread_mutex_unlock(&lock);
  }

  /* A pthread start routine must return a value; falling off the end of a
   * non-void function leaves the thread's exit value undefined. */
  return NULL;
}
 
/*
 * Scenario 2 worker: increment the shared counter `sum` once per iteration
 * while holding the shared mutex.
 *
 * param: pointer to an int holding the number of iterations to run.
 * Returns NULL; the joiner does not use the thread's return value.
 */
void *thread_add_func1(void * param)
{
  /* Block until all workers and the timing thread reach the barrier. */
  pthread_barrier_wait(&barr);

  /* The count is not modified while the workers run, so read it once. */
  const int loops = *((int*)param);
  int i;
  for (i = 0; i < loops; i++)
  {
    pthread_mutex_lock(&lock);
    sum++;
    pthread_mutex_unlock(&lock);
  }

  /* A pthread start routine must return a value; falling off the end of a
   * non-void function leaves the thread's exit value undefined. */
  return NULL;
}
 
/*
 * Scenario 3 worker: run a slightly longer critical section under the shared
 * mutex. Iterations alternate between `sum += 3` and `sum -= 1`, so every
 * pair of iterations adds 2 to `sum`.
 *
 * param: pointer to an int holding the number of iterations to run.
 * Returns NULL; the joiner does not use the thread's return value.
 */
void *thread_add_func2(void * param)
{
  /* Block until all workers and the timing thread reach the barrier. */
  pthread_barrier_wait(&barr);

  /* The count is not modified while the workers run, so read it once. */
  const int loops = *((int*)param);
  int i;
  int j = 0;  /* per-thread toggle selecting which branch runs next */
  for (i = 0; i < loops; i++)
  {
    pthread_mutex_lock(&lock);
    if (j == 0)
    {
      sum += 3;
      j = 1;
    }
    else
    {
      sum -= 1;
      j = 0;
    }
    pthread_mutex_unlock(&lock);
  }

  /* A pthread start routine must return a value; falling off the end of a
   * non-void function leaves the thread's exit value undefined. */
  return NULL;
}
 
/*
 * Scenario 4 worker: increment the shared counter `sum` with the GCC builtin
 * __sync_add_and_fetch instead of taking the mutex.
 *
 * param: pointer to an int holding the number of iterations to run.
 * Returns NULL; the joiner does not use the thread's return value.
 */
void *thread_add_func3(void * param)
{
  /* Block until all workers and the timing thread reach the barrier. */
  pthread_barrier_wait(&barr);

  /* The count is not modified while the workers run, so read it once. */
  const int loops = *((int*)param);
  int i;
  for (i = 0; i < loops; i++)
  {
    __sync_add_and_fetch(&sum, 1);
  }

  /* A pthread start routine must return a value; falling off the end of a
   * non-void function leaves the thread's exit value undefined. */
  return NULL;
}
 
/*
 * Run one benchmark scenario and print its result.
 *
 * func:    worker routine started in every thread; it receives a pointer to
 *          `loop` as its argument.
 * threads: number of worker threads to create (main limits this to
 *          1..MAX_THREADS, the capacity of the tids array).
 * loop:    iteration count handed to each worker.
 * name:    label printed in front of the result line.
 *
 * Resets `sum`, starts the workers, releases them through the barrier,
 * joins them, and prints `sum` together with the elapsed wall-clock time
 * in whole seconds as measured with time().
 * Terminates the process if the barrier or a thread cannot be created.
 */
void test_func(void* (*func)(void *), int threads, int loop, const char *name)
{
  /* threads + 1 participants: every worker plus this timing thread. */
  int rc = pthread_barrier_init(&barr, NULL, threads + 1);
  if (rc != 0)
  {
    fprintf(stderr, "%s: pthread_barrier_init failed (error %d)\n", name, rc);
    exit(EXIT_FAILURE);
  }

  pthread_t tids[MAX_THREADS];

  int i;
  sum = 0;

  for (i = 0; i < threads; i++)
  {
    rc = pthread_create(&tids[i], NULL, func, (void*)&loop);
    if (rc != 0)
    {
      /* With a worker missing the barrier could never be satisfied and the
       * run would hang, so give up instead of waiting forever. */
      fprintf(stderr, "%s: pthread_create failed for thread %d (error %d)\n",
              name, i, rc);
      exit(EXIT_FAILURE);
    }
  }

  /* Release all workers at once and start timing from that moment. */
  pthread_barrier_wait(&barr);
  time_t start = time(NULL);

  for (i = 0; i < threads; i++)
    pthread_join(tids[i], NULL);

  /* sum is a long and the time difference is a time_t; the original passed
   * both to %d, which does not match the argument types. */
  printf("%s:\tsum: %ld, time: %ld seconds\n",
         name, sum, (long)(time(NULL) - start));

  /* The barrier is re-initialised on every call, so release it here. */
  pthread_barrier_destroy(&barr);
}
 
/*
 * Entry point: ./a.out <threads>
 *
 * Parses the thread count (1..MAX_THREADS), divides 1000000000 iterations
 * evenly between the threads using integer division, and runs the four
 * scenarios in order. Returns 0 on success and 1 on a missing or invalid
 * argument.
 */
int main(int argc, char *argv[])
{
  if (argc < 2)
  {
    fprintf(stderr, "usage: %s <threads (1..%d)>\n",
            argc > 0 ? argv[0] : "a.out", MAX_THREADS);
    return 1;
  }

  /* strtol, unlike atoi, lets us reject non-numeric input and trailing
   * garbage. Out-of-range values saturate to LONG_MIN/LONG_MAX and are
   * caught by the bounds check below. */
  char *end;
  long requested = strtol(argv[1], &end, 10);

  if (end == argv[1] || *end != '\0' ||
      requested <= 0 || requested > MAX_THREADS)
  {
    fprintf(stderr, "invalid thread count '%s' (expected 1..%d)\n",
            argv[1], MAX_THREADS);
    return 1;
  }

  int threads = (int)requested;
  int loop = 1000000000 / threads;

  printf("threads: %d, loops per thread: %d\n", threads, loop);

  test_func(thread_add_func0, threads, loop, "mutex-void");
  test_func(thread_add_func1, threads, loop, "mutex-sum");
  test_func(thread_add_func2, threads, loop, "mutex-sum-2");
  test_func(thread_add_func3, threads, loop, "gcc atomic add");

  printf("\n");

  return 0;
}

#include <stdio.h> #include <stdlib.h> #include <pthread.h> #include <time.h> #define MAX_THREADS 10000 long sum = 0; long sum1 = 0; pthread_barrier_t barr; pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER; void *thread_add_func0(void * param) { pthread_barrier_wait(&barr); int i; for (i = 0; i < *((int*)param); i++) { pthread_mutex_lock(&lock); pthread_mutex_unlock(&lock); } } void *thread_add_func1(void * param) { pthread_barrier_wait(&barr); int i; for (i = 0; i < *((int*)param); i++) { pthread_mutex_lock(&lock); sum++; pthread_mutex_unlock(&lock); } } void *thread_add_func2(void * param) { pthread_barrier_wait(&barr); int i; int j = 0; for (i = 0; i < *((int*)param); i++) { pthread_mutex_lock(&lock); if (j == 0) { sum += 3; j = 1; } else { sum -= 1; j = 0; } pthread_mutex_unlock(&lock); } } void *thread_add_func3(void * param) { pthread_barrier_wait(&barr); int i; for (i = 0; i < *((int*)param); i++) __sync_add_and_fetch(&sum, 1); } void test_func(void* (*func)(void *), int threads, int loop, const char *name) { pthread_barrier_init(&barr, NULL, threads + 1); pthread_t tids[MAX_THREADS]; int i; sum = 0; for (i = 0; i < threads; i++) pthread_create(&tids[i], NULL, func, (void*)&loop); pthread_barrier_wait(&barr); time_t start = time(NULL); for (i = 0; i < threads; i++) pthread_join(tids[i], NULL); printf("%s:\tsum: %d, time: %d seconds\n", name, sum, time(NULL) - start); } int main(int argc, char *argv[]) { if (argc < 2) return 1; int threads = atoi(argv[1]); if (threads <=0 || threads > MAX_THREADS) return 1; int loop = 1000000000 / threads; printf("threads: %d, loops per thread: %d\n", threads, loop); test_func(thread_add_func0, threads, loop, "mutex-void"); test_func(thread_add_func1, threads, loop, "mutex-sum"); test_func(thread_add_func2, threads, loop, "mutex-sum-2"); test_func(thread_add_func3, threads, loop, "gcc atomic add"); printf("\n"); return 0; }

测试结果

./a.out 1 && ./a.out 10 && ./a.out 100 && ./a.out 1000 && ./a.out 10000
threads: 1, loops per thread: 1000000000
mutex-void:     sum: 0, time: 38 seconds
mutex-sum:      sum: 1000000000, time: 39 seconds
mutex-sum-2:    sum: 1000000000, time: 39 seconds
gcc atomic add: sum: 1000000000, time: 16 seconds
 
threads: 10, loops per thread: 100000000
mutex-void:     sum: 0, time: 115 seconds
mutex-sum:      sum: 1000000000, time: 118 seconds
mutex-sum-2:    sum: 1000000000, time: 122 seconds
gcc atomic add: sum: 1000000000, time: 28 seconds
 
threads: 100, loops per thread: 10000000
mutex-void:     sum: 0, time: 104 seconds
mutex-sum:      sum: 1000000000, time: 107 seconds
mutex-sum-2:    sum: 1000000000, time: 110 seconds
gcc atomic add: sum: 1000000000, time: 27 seconds
 
threads: 1000, loops per thread: 1000000
mutex-void:     sum: 0, time: 102 seconds
mutex-sum:      sum: 1000000000, time: 105 seconds
mutex-sum-2:    sum: 1000000000, time: 109 seconds
gcc atomic add: sum: 1000000000, time: 27 seconds
 
threads: 10000, loops per thread: 100000
mutex-void:     sum: 0, time: 94 seconds
mutex-sum:      sum: 1000000000, time: 96 seconds
mutex-sum-2:    sum: 1000000000, time: 101 seconds
gcc atomic add: sum: 1000000000, time: 27 seconds

结果分析

1. 单线程状态下pthread库一次lock+unlock大概消耗40ns
2. 除了单线程外，其它四组数据（10/100/1000/10000线程）基本相同，这说明pthread库在并发增加时并不会性能急剧下降，一次lock+unlock消耗100ns左右
3. mutex-void/mutex-sum/mutex-sum-2结果对比可以看出，内存存取指令消耗时间大概是pthread一次lock+unlock的1/30
4. GCC atomic操作效率非常高，尤其是在并发环境下，性能大概是pthread库的4~5倍

Pthread mutex与GCC atomic性能测试

问题背景

测试场景

硬件环境

测试代码

测试结果

结果分析

相关

发表回复取消回复

问题背景

测试场景

硬件环境

测试代码

测试结果

结果分析

相关

发表回复 取消回复

发表回复取消回复