R&D/DPDK

malloc/free performance

sunshout 2015. 4. 17. 20:45

DPDK 의 rte_malloc 은 기존 malloc 에 비해 성능이 상당히 떨어진다.


실험:

1024 Bytes 크기의 memory 에 대해 malloc 과 free 를 10,000,000 번 반복 수행하여 평균값을 구하였다.




DPDK 성능 저하 원인:

ㅇ rte_malloc 은 할당 시 spin_lock 을 잡고 진행되는데, thread 의 개수가 증가하면 lock contention 이 발생하여 성능 저하가 linear 하게 증가하는 것 같다.


실험환경

Ubuntu 14.04, x86_64

CPU: Intel(R) Atom(TM) CPU  C2758  @ 2.40GHz

Memory: 32GB

소스코드


#include <stdio.h>

#include <string.h>

#include <stdint.h>

#include <errno.h>

#include <sys/queue.h>

#include <stdlib.h>

#include <sys/time.h>

#include <unistd.h>


#include <rte_memory.h>

#include <rte_memzone.h>

#include <rte_launch.h>

#include <rte_eal.h>

#include <rte_per_lcore.h>

#include <rte_lcore.h>

#include <rte_debug.h>

#include <rte_malloc.h>


/* Microseconds per second; added back when a tv_usec difference goes negative. */
#define USECSPERSEC 1000000


/* Size in bytes of each allocation requested in the benchmark loop. */
static uint64_t size = 1024;

/* Number of loop iterations (malloc/free pairs) each lcore performs. */
static uint64_t iteration_count = 10000000;


/*
 * No-op placeholder called from the "null" timing loop.
 *
 * The argument is ignored and the result is always NULL.
 */
void *
dummy(uint64_t i)
{
        (void)i;        /* intentionally unused */

        return NULL;
}


static int

lcore_malloc(__attribute__((unused)) void *arg)

{

        register uint64_t i;

        register uint64_t request_size = size;

        register uint64_t total_iterations = iteration_count;

        struct timeval start, end, null, elapsed, adjusted;



        uint64_t lcore_id;

        lcore_id = rte_lcore_id();

        printf("hello from core %lu\n", lcore_id);

        /*

         * Time a null loop.  We'll subtract this from the final

         * malloc loop results to get a more accurate value.

         */

        gettimeofday(&start, NULL);


        for (i = 0; i < total_iterations; i++) {

                register void * buf;

                buf = dummy(i);

                buf = dummy(i);

        }


        gettimeofday(&end, NULL);


        null.tv_sec = end.tv_sec - start.tv_sec;

        null.tv_usec = end.tv_usec - start.tv_usec;

        if (null.tv_usec < 0) {

                null.tv_sec--;

                null.tv_usec += USECSPERSEC;

        }


        /*

         * Run the real malloc test

         */

        gettimeofday(&start, NULL);


        for (i = 0; i < total_iterations; i++) {

                register void * buf;

                buf = malloc(request_size);

                free(buf);

                //buf = rte_malloc("rte",request_size, RTE_CACHE_LINE_SIZE);

                //rte_free(buf);

        }


        gettimeofday(&end, NULL);


        elapsed.tv_sec = end.tv_sec - start.tv_sec;

        elapsed.tv_usec = end.tv_usec - start.tv_usec;

        if (elapsed.tv_usec < 0) {

                elapsed.tv_sec--;

                elapsed.tv_usec += USECSPERSEC;

        }


        /*

         * Adjust elapsed time by null loop time

         */

        adjusted.tv_sec = elapsed.tv_sec - null.tv_sec;

        adjusted.tv_usec = elapsed.tv_usec - null.tv_usec;

        if (adjusted.tv_usec < 0) {

                adjusted.tv_sec--;

                adjusted.tv_usec += USECSPERSEC;

        }

        printf("Thread %lu adjusted timing: %d.%06d seconds for %lu requests"

                " of %lu bytes.\n", pthread_self(),

                adjusted.tv_sec, adjusted.tv_usec, total_iterations,

                request_size);


        return 0;

}


/*
 * Program entry point.
 *
 * Initializes the EAL with the command-line arguments, launches
 * lcore_malloc() on every slave lcore, runs it on the master lcore as well,
 * and then waits on the lcores before returning 0.  Panics if EAL
 * initialization fails.
 */
int
main(int argc, char **argv)
{
        uint64_t worker;

        /* Nothing can be launched until the EAL is up. */
        if (rte_eal_init(argc, argv) < 0)
                rte_panic("Cannot init EAL\n");

        /* Start the benchmark on each slave lcore. */
        RTE_LCORE_FOREACH_SLAVE(worker) {
                rte_eal_remote_launch(lcore_malloc, NULL, worker);
        }

        /* The master lcore runs the same benchmark itself. */
        lcore_malloc(NULL);

        rte_eal_mp_wait_lcore();

        return 0;
}



Makefile


# Build file for the malloc_test application. It relies on the makefile
# fragments shipped under $(RTE_SDK)/mk, so RTE_SDK must be set.
ifeq ($(RTE_SDK),)

$(error "Please define RTE_SDK environment variable")

endif


# Default target, can be overridden by command line or environment

RTE_TARGET ?= x86_64-native-linuxapp-gcc


include $(RTE_SDK)/mk/rte.vars.mk


# binary name

APP = malloc_test


# all sources are stored in SRCS-y

SRCS-y := main.c


# build with debug information
CFLAGS += -g

#CFLAGS += $(WERROR_FLAGS)


include $(RTE_SDK)/mk/rte.extapp.mk




실험 명령어

./build/malloc_test -c f -n 3

./build/malloc_test -c 1 -n 3

./build/malloc_test -c 3 -n 3

./build/malloc_test -c 0x7 -n 3

./build/malloc_test -c 0xf -n 3

./build/malloc_test -c 0x1f -n 3

./build/malloc_test -c 0x3f -n 3

./build/malloc_test -c 0x7f -n 3

./build/malloc_test -c 0xff -n 3