DPDK 에서 rte_malloc 은 기존 malloc 대비 상당한 성능 저하가 존재한다.
실험:
1024 Bytes 크기의 memory 를 10,000,000 번 malloc 과 free 를 반복 수행하여 평균값을 구하였다.
DPDK 성능 저하 원인:
ㅇ rte_malloc 은 할당 시 spinlock 을 잡고 진행되는데, thread 의 개수가 증가하면 lock contention 이 발생하여 성능 저하가 thread 수에 비례해(linear 하게) 커지는 것으로 보인다.
실험환경
Ubuntu 14.04, x86_64
CPU: Intel(R) Atom(TM) CPU C2758 @ 2.40GHz
소스코드
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <errno.h>
#include <sys/queue.h>
#include <stdlib.h>
#include <sys/time.h>
#include <unistd.h>
#include <rte_memory.h>
#include <rte_memzone.h>
#include <rte_launch.h>
#include <rte_eal.h>
#include <rte_per_lcore.h>
#include <rte_lcore.h>
#include <rte_debug.h>
#include <rte_malloc.h>
#define USECSPERSEC 1000000 /* microseconds per second, for struct timeval arithmetic */
/* Size in bytes of each allocation request made by the benchmark loop. */
static uint64_t size = 1024;
/* Number of malloc()/free() pairs each lcore performs per timed loop. */
static uint64_t iteration_count = 10000000;
/*
 * Opaque no-op called from the "null" calibration loop so that loop has
 * the same call overhead as the real malloc loop.  Always returns NULL.
 * NOTE(review): intentionally left non-static here — an inlinable
 * definition could let the compiler delete the calibration loop entirely;
 * confirm this matches the benchmark's intent.
 */
void *dummy (uint64_t i)
{
	(void)i; /* parameter exists only to mimic a real call site */
	return NULL;
}
/*
 * Benchmark body executed on each lcore: measures the cost of
 * `iteration_count` malloc()/free() pairs of `size` bytes.
 *
 * A "null" loop of opaque dummy() calls is timed first; its duration is
 * subtracted from the malloc loop so only the allocator cost remains.
 * The adjusted time is printed per thread.
 *
 * arg: unused (signature required by rte_eal_remote_launch()).
 * Returns 0 always.
 */
static int
lcore_malloc(__attribute__((unused)) void *arg)
{
	uint64_t i;
	uint64_t request_size = size;
	uint64_t total_iterations = iteration_count;
	struct timeval start, end, null, elapsed, adjusted;
	unsigned lcore_id; /* rte_lcore_id() returns unsigned, not uint64_t */

	lcore_id = rte_lcore_id();
	printf("hello from core %u\n", lcore_id);

	/*
	 * Time a null loop.  We'll subtract this from the final
	 * malloc loop results to get a more accurate value.
	 */
	gettimeofday(&start, NULL);
	for (i = 0; i < total_iterations; i++) {
		void *buf;

		buf = dummy(i);
		buf = dummy(i);
		(void)buf;
	}
	gettimeofday(&end, NULL);
	null.tv_sec = end.tv_sec - start.tv_sec;
	null.tv_usec = end.tv_usec - start.tv_usec;
	if (null.tv_usec < 0) {
		/* Borrow one second when the microsecond delta went negative. */
		null.tv_sec--;
		null.tv_usec += USECSPERSEC;
	}

	/*
	 * Run the real malloc test.
	 */
	gettimeofday(&start, NULL);
	for (i = 0; i < total_iterations; i++) {
		void *buf;

		buf = malloc(request_size);
		free(buf);
		/* Swap in these two lines to benchmark the DPDK allocator: */
		/* buf = rte_malloc("rte", request_size, RTE_CACHE_LINE_SIZE); */
		/* rte_free(buf); */
	}
	gettimeofday(&end, NULL);
	elapsed.tv_sec = end.tv_sec - start.tv_sec;
	elapsed.tv_usec = end.tv_usec - start.tv_usec;
	if (elapsed.tv_usec < 0) {
		elapsed.tv_sec--;
		elapsed.tv_usec += USECSPERSEC;
	}

	/*
	 * Adjust elapsed time by the null loop time.
	 */
	adjusted.tv_sec = elapsed.tv_sec - null.tv_sec;
	adjusted.tv_usec = elapsed.tv_usec - null.tv_usec;
	if (adjusted.tv_usec < 0) {
		adjusted.tv_sec--;
		adjusted.tv_usec += USECSPERSEC;
	}

	/*
	 * tv_sec/tv_usec are time_t/suseconds_t — cast to long for %ld
	 * (the old %d was a format mismatch).  uint64_t counters are cast
	 * to unsigned long long for a portable %llu.
	 */
	printf("Thread %lu adjusted timing: %ld.%06ld seconds for %llu requests"
	       " of %llu bytes.\n", (unsigned long)pthread_self(),
	       (long)adjusted.tv_sec, (long)adjusted.tv_usec,
	       (unsigned long long)total_iterations,
	       (unsigned long long)request_size);

	return 0;
}
/*
 * Entry point: initialize the DPDK EAL, launch lcore_malloc() on every
 * slave lcore, run it on the master lcore as well, then wait for all
 * launched lcores to finish before exiting.
 */
int
main(int argc, char **argv)
{
	int ret;
	unsigned lcore_id; /* DPDK lcore ids are unsigned, not uint64_t */

	ret = rte_eal_init(argc, argv);
	if (ret < 0)
		rte_panic("Cannot init EAL\n");

	/* Launch lcore_malloc() on every slave lcore. */
	RTE_LCORE_FOREACH_SLAVE(lcore_id) {
		rte_eal_remote_launch(lcore_malloc, NULL, lcore_id);
	}

	/* Run it on the master lcore too. */
	lcore_malloc(NULL);

	/* Block until every remote lcore has returned. */
	rte_eal_mp_wait_lcore();
	return 0;
}
Makefile
ifeq ($(RTE_SDK),)
$(error "Please define RTE_SDK environment variable")
endif
# Default target, can be overridden by command line or environment
RTE_TARGET ?= x86_64-native-linuxapp-gcc
include $(RTE_SDK)/mk/rte.vars.mk
# binary name
APP = malloc_test
# all source are stored in SRCS-y
SRCS-y := main.c
CFLAGS += -g
#CFLAGS += $(WERROR_FLAGS)
include $(RTE_SDK)/mk/rte.extapp.mk
실험 명령어
./build/malloc_test -c f -n 3
./build/malloc_test -c 1 -n 3
./build/malloc_test -c 3 -n 3
./build/malloc_test -c 0x7 -n 3
./build/malloc_test -c 0xf -n 3
./build/malloc_test -c 0x1f -n 3
./build/malloc_test -c 0x3f -n 3
./build/malloc_test -c 0x7f -n 3
./build/malloc_test -c 0xff -n 3