dnn_backend_native_layer_conv2d.c:Add mutithread function

Use pthread to multithread dnn_execute_layer_conv2d. Can be tested with command "./ffmpeg_g -i input.png -vf \ format=yuvj420p,dnn_processing=dnn_backend=native:model= \ espcn.model:input=x:output=y:options=conv2d_threads=23 \ -y sr_native.jpg -benchmark" before patch: utime=11.238s stime=0.005s rtime=11.248s after patch: utime=20.817s stime=0.047s rtime=1.051s on my 3900X 12c24t @4.2GHz About the increase of utime, it's because that CPU HyperThreading technology makes logical cores twice of physical cores while cpu's counting performance improves less than double. And utime sums all cpu's logical cores' runtime. As a result, using threads num near cpu's logical core's number will double utime, while reduce rtime less than half for HyperThreading CPUs. Signed-off-by: Xu Jun <xujunzz@sjtu.edu.cn> Signed-off-by: Guo, Yejun <yejun.guo@intel.com>
2020-09-06 20:28:53 +08:00
parent 235e01f5a0
commit 3c7cad69f2
2 changed files with 108 additions and 13 deletions
--- a/libavfilter/dnn/dnn_backend_native_layer_conv2d.c
+++ b/libavfilter/dnn/dnn_backend_native_layer_conv2d.c
@@ -19,10 +19,27 @@
 */
 #include "libavutil/avassert.h"
 #include "libavutil/thread.h"
 #include "libavutil/cpu.h"
 #include "dnn_backend_native_layer_conv2d.h"
 #define CLAMP_TO_EDGE(x, w) ((x) < 0 ? 0 : ((x) >= (w) ? (w - 1) : (x)))
 //struct to pass parameters
 typedef struct thread_common_param{
    DnnOperand *operands;
    const int32_t *input_operand_indexes;
    int32_t output_operand_index;
    const void *parameters;
    NativeContext *ctx;
    int thread_num;
 } thread_common_param;
 typedef struct thread_param{
    thread_common_param *thread_common_param;
    int thread_index;
 } thread_param;
 int dnn_load_layer_conv2d(Layer *layer, AVIOContext *model_file_context, int file_size, int operands_num)
 {
    ConvolutionalParams *conv_params;
@@ -88,17 +105,20 @@ int dnn_load_layer_conv2d(Layer *layer, AVIOContext *model_file_context, int fil
    return dnn_size;
 }
-int dnn_execute_layer_conv2d(DnnOperand *operands, const int32_t *input_operand_indexes,
+static void * dnn_execute_layer_conv2d_thread(void *threadarg)
                             int32_t output_operand_index, const void *parameters, NativeContext *ctx)
 {
    //pass parameters
    thread_param *thread_param = (struct thread_param *)threadarg;
    thread_common_param *thread_common_param = thread_param->thread_common_param;
    DnnOperand *operands = thread_common_param->operands;
    float *output;
-    int32_t input_operand_index = input_operand_indexes[0];
+    int32_t input_operand_index = thread_common_param->input_operand_indexes[0];
    int number = operands[input_operand_index].dims[0];
    int height = operands[input_operand_index].dims[1];
    int width = operands[input_operand_index].dims[2];
    int channel = operands[input_operand_index].dims[3];
    const float *input = operands[input_operand_index].data;
-    const ConvolutionalParams *conv_params = (const ConvolutionalParams *)parameters;
+    const ConvolutionalParams *conv_params = (const ConvolutionalParams *)(thread_common_param->parameters);
    int radius = conv_params->kernel_size >> 1;
    int src_linesize = width * conv_params->input_num;
@@ -106,7 +126,11 @@ int dnn_execute_layer_conv2d(DnnOperand *operands, const int32_t *input_operand_
    int filter_size = conv_params->kernel_size * filter_linesize;
    int pad_size = (conv_params->padding_method == VALID) ? (conv_params->kernel_size - 1) / 2 * conv_params->dilation : 0;
-    DnnOperand *output_operand = &operands[output_operand_index];
+    int thread_stride = (height - pad_size * 2) / thread_common_param->thread_num;
    int thread_start = thread_stride * thread_param->thread_index + pad_size;
    int thread_end = (thread_param->thread_index == thread_common_param->thread_num - 1) ? (height - pad_size) : (thread_start + thread_stride);
    DnnOperand *output_operand = &operands[thread_common_param->output_operand_index];
    output_operand->dims[0] = number;
    output_operand->dims[1] = height - pad_size * 2;
    output_operand->dims[2] = width - pad_size * 2;
@@ -114,19 +138,21 @@ int dnn_execute_layer_conv2d(DnnOperand *operands, const int32_t *input_operand_
    output_operand->data_type = operands[input_operand_index].data_type;
    output_operand->length = calculate_operand_data_length(output_operand);
    if (output_operand->length <= 0) {
-        av_log(ctx, AV_LOG_ERROR, "The output data length overflow\n");
+        av_log(thread_common_param->ctx, AV_LOG_ERROR, "The output data length overflow\n");
-        return DNN_ERROR;
+        return (void *)DNN_ERROR;
    }
    output_operand->data = av_realloc(output_operand->data, output_operand->length);
    if (!output_operand->data) {
-        av_log(ctx, AV_LOG_ERROR, "Failed to reallocate memory for output\n");
+        av_log(thread_common_param->ctx, AV_LOG_ERROR, "Failed to reallocate memory for output\n");
-        return DNN_ERROR;
+        return (void *)DNN_ERROR;
    }
    output = output_operand->data;
    output += (conv_params->output_num) * (width - 2 * pad_size) * (thread_start - pad_size);
    av_assert0(channel == conv_params->input_num);
-    for (int y = pad_size; y < height - pad_size; ++y) {
+    for (int y = thread_start; y < thread_end; ++y) {
        for (int x = pad_size; x < width - pad_size; ++x) {
            for (int n_filter = 0; n_filter < conv_params->output_num; ++n_filter) {
                if (conv_params->has_bias)
@@ -174,5 +200,64 @@ int dnn_execute_layer_conv2d(DnnOperand *operands, const int32_t *input_operand_
            output += conv_params->output_num;
        }
    }
-    return 0;
+    return (void *)DNN_SUCCESS;
 }
 int dnn_execute_layer_conv2d(DnnOperand *operands, const int32_t *input_operand_indexes,
                             int32_t output_operand_index, const void *parameters, NativeContext *ctx)
 {
    int thread_num = (ctx->options.conv2d_threads <= 0 || ctx->options.conv2d_threads > av_cpu_count())
        ? (av_cpu_count() + 1) : (ctx->options.conv2d_threads);
 #if HAVE_PTHREAD_CANCEL
    pthread_t *thread_id = av_malloc(thread_num * sizeof(pthread_t));
 #endif
    thread_param **thread_param = av_malloc(thread_num * sizeof(*thread_param));
    void *res;
    int error_flag = DNN_SUCCESS;
    //struct used to pass parameters
    thread_common_param thread_common_param;
    thread_common_param.operands = operands;
    thread_common_param.input_operand_indexes = input_operand_indexes;
    thread_common_param.output_operand_index = output_operand_index;
    thread_common_param.parameters = parameters;
    thread_common_param.ctx = ctx;
 #if HAVE_PTHREAD_CANCEL
    thread_common_param.thread_num = thread_num;
    //create threads
    for (int i = 0; i < thread_num; i++){
        thread_param[i] = av_malloc(sizeof(thread_param));
        thread_param[i]->thread_common_param = &thread_common_param;
        thread_param[i]->thread_index = i;
        pthread_create(&thread_id[i], NULL, dnn_execute_layer_conv2d_thread, (void *)thread_param[i]);
    }
    //join threads, res gets function return
    for (int i = 0; i < thread_num; i++){
        pthread_join(thread_id[i], &res);
        if ((int)res != DNN_SUCCESS)
            error_flag = (int)res;
    }
    //release memory
    av_free(thread_id);
    for (int i = 0; i < thread_num; i++){
        av_free(thread_param[i]);
    }
 #else
    thread_common_param.thread_num = 1;
    thread_param[0] = av_malloc(sizeof(thread_param));
    thread_param[0]->thread_common_param = &thread_common_param;
    thread_param[0]->thread_index = 0;
    res = dnn_execute_layer_conv2d_thread((void *)thread_param[0]);
    if ((int)res != DNN_SUCCESS)
        error_flag = (int)res;
    av_free(thread_param[0]);
 #endif
    av_free(thread_param);
    return error_flag;
 }
--- a/tests/dnn/dnn-layer-conv2d-test.c
+++ b/tests/dnn/dnn-layer-conv2d-test.c
@@ -25,6 +25,8 @@
 #define EPSON 0.00001
 extern const AVClass dnn_native_class;
 static int test_with_same_dilate(void)
 {
    // the input data and expected data are generated with below python code.
@@ -96,6 +98,10 @@ static int test_with_same_dilate(void)
    };
    float bias[2] = { -1.6574852, -0.72915393 };
    NativeContext ctx;
    ctx.class = &dnn_native_class;
    ctx.options.conv2d_threads = 1;
    params.activation = TANH;
    params.has_bias = 1;
    params.biases = bias;
@@ -114,7 +120,7 @@ static int test_with_same_dilate(void)
    operands[1].data = NULL;
    input_indexes[0] = 0;
-    dnn_execute_layer_conv2d(operands, input_indexes, 1, &params, NULL);
+    dnn_execute_layer_conv2d(operands, input_indexes, 1, &params, &ctx);
    output = operands[1].data;
    for (int i = 0; i < sizeof(expected_output) / sizeof(float); i++) {
@@ -196,6 +202,10 @@ static int test_with_valid(void)
    };
    float bias[2] = { -0.4773722, -0.19620377 };
    NativeContext ctx;
    ctx.class = &dnn_native_class;
    ctx.options.conv2d_threads = 1;
    params.activation = TANH;
    params.has_bias = 1;
    params.biases = bias;
@@ -214,7 +224,7 @@ static int test_with_valid(void)
    operands[1].data = NULL;
    input_indexes[0] = 0;
-    dnn_execute_layer_conv2d(operands, input_indexes, 1, &params, NULL);
+    dnn_execute_layer_conv2d(operands, input_indexes, 1, &params, &ctx);
    output = operands[1].data;
    for (int i = 0; i < sizeof(expected_output) / sizeof(float); i++) {