并行加速实战 二维中值滤波器

中值滤波器使用了快速3x3中值滤波器 数据类型16U


摘要

我们以下将使用

1. SIMD: SSE, AVX

2. multiThread: openmp, std::thread

3. SIMD + multiThread: AVX + openmp

4. data: 分行并行加速,分块儿并行加速


这里先把文末的总结写出来

总结:

1.1 快速3X3中值滤波本身的计算是元素比较和排序,

属于控制密集型,而非运算密集型

所以带宽的耗时比计算的耗时明显

1.2 由于算法需要,数据IO使用的是loadu和storeu,而不是load和store。读写访问速度要慢一些。

2. 单核版本和SSE加速版本耗时几乎一样,这里是因为VS编译器的优化使得快速3X3中值滤波速度很快

计算速度瓶颈主要在内存访问

3. omp加速版本相比单核的快了3.65倍,接近4倍,和四核加速比较符合

4. omp分行加速和多行分块加速是几乎一样的,可见这里的额外开销不大

5. avx加速是sse加速的1.62倍,接近2倍,符合128位数据和256位数据特性

6. SIMD + multiThread : 这里测试了avx + omp

    6.1 在avx的基础上增加omp多线程确实能增加3.65倍速度,接近4倍,和四核加速比较符合

    6.2 avx + omp分行加速和多行分块加速是几乎一样的,可见这里的额外开销不大

7. std::thread的额外开销好像很大,分块并行反而更慢了,近20倍

8* neon版本在文末,暂时没有测试过


先给出宏定义的操作

#define med_op(a,b,t)	{t = a; a = MAX(a, b); b = MIN(t, b);}
#define med_op_sse(a,b,t)	{t = a; a = _mm_max_epu16(a, b); b = _mm_min_epu16(t, b); }
#define med_op_neon(a,b,t)	{t = a; a = vmaxq_u16(a, b); b = vminq_u16(t, b); }
#define med_op_avx(a,b,t)	{t = a; a = _mm256_max_epu16(a, b); b = _mm256_min_epu16(t, b); }

这里先介绍单核版本的

void medianFilt::median3(U16 *src, U16 *dst, S32 width, S32 height)
{
    U16 p0 = 0;
    U16 p1 = 0;
    U16 p2 = 0;
    U16 p3 = 0;
    U16 p4 = 0;
    U16 p5 = 0;
    U16 p6 = 0;
    U16 p7 = 0;
    U16 p8 = 0;
    U16 tmp = 0;

    S32 i0 = 0;
    S32 i = 0;
    S32 i2 = 0;
    S32 j = 0;

    U16 *psrc1 = NULL;
    U16 *psrc2 = NULL;
    U16 *psrc3 = NULL;
    U16 *pdst = NULL;

    memcpy(dst, src, width * sizeof(U16));

    psrc1 = src;
    psrc2 = psrc1 + width;
    psrc3 = psrc2 + width;
    pdst = dst + width;

    for (j = 1; j < height - 1; j++)
    {
        pdst[0] = psrc2[0];

        for (i = 1; i < width - 1; i++)
        {
            i0 = i - 1;
            i2 = i + 1;

            p0 = psrc1[i0];
            p1 = psrc1[i];
            p2 = psrc1[i2];
            p3 = psrc2[i0];
            p4 = psrc2[i];
            p5 = psrc2[i2];
            p6 = psrc3[i0];
            p7 = psrc3[i];
            p8 = psrc3[i2];

            med_op(p1, p2, tmp);
            med_op(p4, p5, tmp);
            med_op(p7, p8, tmp);
            med_op(p0, p1, tmp);
            med_op(p3, p4, tmp);
            med_op(p6, p7, tmp);
            med_op(p1, p2, tmp);
            med_op(p4, p5, tmp);
            med_op(p7, p8, tmp);
            med_op(p0, p3, tmp);
            med_op(p5, p8, tmp);
            med_op(p4, p7, tmp);
            med_op(p3, p6, tmp);
            med_op(p1, p4, tmp);
            med_op(p2, p5, tmp);
            med_op(p4, p7, tmp);
            med_op(p4, p2, tmp);
            med_op(p6, p4, tmp);
            med_op(p4, p2, tmp);

            pdst[i] = p4;;
        }

        pdst[i] = psrc2[i];

        psrc1 += width;
        psrc2 += width;
        psrc3 += width;
        pdst += width;
    }

    memcpy(pdst, psrc2, width * sizeof(U16));
}

openmp行分块的加速

// median omp
void medianFilt::median3_omp(U16 *src, U16 *dst, S32 width, S32 height)
{
    S32 j = 0;

    memcpy(dst, src, width * sizeof(U16));
    memcpy(dst + width * height - width, src + width * height - width, width * sizeof(U16));

#pragma omp parallel for
    for (j = 1; j < height - 1; j++)
    {
        U16 *psrc2 = src + width * j;
        U16 *pdst = dst + width * j;
        U16 *psrc1 = psrc2 - width;
        U16 *psrc3 = psrc2 + width;

        U16 p0 = 0;
        U16 p1 = 0;
        U16 p2 = 0;
        U16 p3 = 0;
        U16 p4 = 0;
        U16 p5 = 0;
        U16 p6 = 0;
        U16 p7 = 0;
        U16 p8 = 0;
        U16 tmp = 0;

        S32 i0 = 0;
        S32 i = 0;
        S32 i2 = 0;

        pdst[0] = psrc2[0];

        for (i = 1; i < width - 1; i++)
        {
            i0 = i - 1;
            i2 = i + 1;

            p0 = psrc1[i0];
            p1 = psrc1[i];
            p2 = psrc1[i2];
            p3 = psrc2[i0];
            p4 = psrc2[i];
            p5 = psrc2[i2];
            p6 = psrc3[i0];
            p7 = psrc3[i];
            p8 = psrc3[i2];

            med_op(p1, p2, tmp);
            med_op(p4, p5, tmp);
            med_op(p7, p8, tmp);
            med_op(p0, p1, tmp);
            med_op(p3, p4, tmp);
            med_op(p6, p7, tmp);
            med_op(p1, p2, tmp);
            med_op(p4, p5, tmp);
            med_op(p7, p8, tmp);
            med_op(p0, p3, tmp);
            med_op(p5, p8, tmp);
            med_op(p4, p7, tmp);
            med_op(p3, p6, tmp);
            med_op(p1, p4, tmp);
            med_op(p2, p5, tmp);
            med_op(p4, p7, tmp);
            med_op(p4, p2, tmp);
            med_op(p6, p4, tmp);
            med_op(p4, p2, tmp);

            pdst[i] = p4;;
        }

        pdst[i] = psrc2[i];
    }
}

中值滤波分块版本(若干行的块儿),后面会被调用

static void median3_section(U16 *src, U16 *dst, S32 width, S32 height)
{
    U16 p0 = 0;
    U16 p1 = 0;
    U16 p2 = 0;
    U16 p3 = 0;
    U16 p4 = 0;
    U16 p5 = 0;
    U16 p6 = 0;
    U16 p7 = 0;
    U16 p8 = 0;
    U16 tmp = 0;

    S32 i0 = 0;
    S32 i = 0;
    S32 i2 = 0;
    S32 j = 0;

    U16 *psrc1 = NULL;
    U16 *psrc2 = NULL;
    U16 *psrc3 = NULL;
    U16 *pdst = NULL;

    psrc1 = src;
    psrc2 = psrc1 + width;
    psrc3 = psrc2 + width;
    pdst = dst + width;

    for (j = 1; j < height - 1; j++)
    {
        pdst[0] = psrc2[0];

        for (i = 1; i < width - 1; i++)
        {
            i0 = i - 1;
            i2 = i + 1;

            p0 = psrc1[i0];
            p1 = psrc1[i];
            p2 = psrc1[i2];
            p3 = psrc2[i0];
            p4 = psrc2[i];
            p5 = psrc2[i2];
            p6 = psrc3[i0];
            p7 = psrc3[i];
            p8 = psrc3[i2];

            med_op(p1, p2, tmp);
            med_op(p4, p5, tmp);
            med_op(p7, p8, tmp);
            med_op(p0, p1, tmp);
            med_op(p3, p4, tmp);
            med_op(p6, p7, tmp);
            med_op(p1, p2, tmp);
            med_op(p4, p5, tmp);
            med_op(p7, p8, tmp);
            med_op(p0, p3, tmp);
            med_op(p5, p8, tmp);
            med_op(p4, p7, tmp);
            med_op(p3, p6, tmp);
            med_op(p1, p4, tmp);
            med_op(p2, p5, tmp);
            med_op(p4, p7, tmp);
            med_op(p4, p2, tmp);
            med_op(p6, p4, tmp);
            med_op(p4, p2, tmp);

            pdst[i] = p4;;
        }

        pdst[i] = psrc2[i];

        psrc1 += width;
        psrc2 += width;
        psrc3 += width;
        pdst += width;
    }
}

中值滤波使用分四块的openmp加速 (这里是针对290列的图像写的偏移72和74)

void medianFilt::median3_omp4(U16 *src, U16 *dst, S32 width, S32 height)
{
    //     const S32 CPU_CORE_NUM = 4;

    S32 j = 0;
    S32 offset = 0;

    memcpy(dst, src, width * sizeof(U16));
    memcpy(dst + width * height - width, src + width * height - width, width * sizeof(U16));

#pragma omp parallel for
    for (j = 0; j < 4; j++)
    {
        offset = j * width * 72;
        median3_section(src + offset, dst + offset, width, 74);
    }
}

中值滤波器使用std::thread做4线程加速(四块儿)(这里是针对290列的图像写的偏移72和74)

void medianFilt::median3_4thd(U16 *src, U16 *dst, S32 width, S32 height)
{
//     const S32 CPU_CORE_NUM = 4;

    S32 j = 0;
    S32 offset = 0;

    U16 *psrc = NULL;
    U16 *pdst = NULL;

    memcpy(dst, src, width * sizeof(U16));
    memcpy(dst + width * height - width, src + width * height - width, width * sizeof(U16));

    psrc = src;
    pdst = dst;
    std::thread t1(median3_section, psrc, pdst, width, 74);

    offset = 72 * width;
    psrc = src + offset;
    pdst = dst + offset;
    std::thread t2(median3_section, psrc, pdst, width, 74);

    offset = 144 * width;
    psrc = src + offset;
    pdst = dst + offset;
    std::thread t3(median3_section, psrc, pdst, width, 74);

    offset = 216 * width;
    psrc = src + offset;
    pdst = dst + offset;
    std::thread t4(median3_section, psrc, pdst, width, 74);

    t1.detach();
    t2.detach();
    t3.detach();
    t4.join();
}

中值滤波使用SSE加速

//  median SSE version
void medianFilt::median3_sse(U16 *src, U16 *dst, S32 width, S32 height)
{
#define MED_VEC_SZ 8

    S32 i0 = 0;
    S32 i = 0;
    S32 i2 = 0;
    S32 j = 0;
    const S32 xend = (width - 2) % MED_VEC_SZ + 1;

    __m128i v0;
    __m128i v1;
    __m128i v2;
    __m128i v3;
    __m128i v4;
    __m128i v5;
    __m128i v6;
    __m128i v7;
    __m128i v8;
    __m128i vtmp;

    U16 *psrc1 = NULL;
    U16 *psrc2 = NULL;
    U16 *psrc3 = NULL;
    U16 *pdst = NULL;

    memcpy(dst, src, width * sizeof(U16));

    psrc1 = src;
    psrc2 = psrc1 + width;
    psrc3 = psrc2 + width;
    pdst = dst + width;

    for (j = 1; j < height - 1; j++)
    {
        pdst[0] = psrc2[0];

        v0 = _mm_loadu_si128((const __m128i*)(psrc1));
        v1 = _mm_loadu_si128((const __m128i*)(psrc1 + 1));
        v2 = _mm_loadu_si128((const __m128i*)(psrc1 + 2));
        v3 = _mm_loadu_si128((const __m128i*)(psrc2));
        v4 = _mm_loadu_si128((const __m128i*)(psrc2 + 1));
        v5 = _mm_loadu_si128((const __m128i*)(psrc2 + 2));
        v6 = _mm_loadu_si128((const __m128i*)(psrc3));
        v7 = _mm_loadu_si128((const __m128i*)(psrc3 + 1));
        v8 = _mm_loadu_si128((const __m128i*)(psrc3 + 2));

        med_op_sse(v1, v2, vtmp);
        med_op_sse(v4, v5, vtmp);
        med_op_sse(v7, v8, vtmp);
        med_op_sse(v0, v1, vtmp);
        med_op_sse(v3, v4, vtmp);
        med_op_sse(v6, v7, vtmp);
        med_op_sse(v1, v2, vtmp);
        med_op_sse(v4, v5, vtmp);
        med_op_sse(v7, v8, vtmp);
        med_op_sse(v0, v3, vtmp);
        med_op_sse(v5, v8, vtmp);
        med_op_sse(v4, v7, vtmp);
        med_op_sse(v3, v6, vtmp);
        med_op_sse(v1, v4, vtmp);
        med_op_sse(v2, v5, vtmp);
        med_op_sse(v4, v7, vtmp);
        med_op_sse(v4, v2, vtmp);
        med_op_sse(v6, v4, vtmp);
        med_op_sse(v4, v2, vtmp);

        _mm_storeu_si128((__m128i*)(pdst + 1), v4);

        for (i = xend; i < width - 1; i += MED_VEC_SZ)
        {
            i0 = i - 1;
            i2 = i + 1;

            v0 = _mm_loadu_si128((const __m128i*)(psrc1 + i0));
            v1 = _mm_loadu_si128((const __m128i*)(psrc1 + i));
            v2 = _mm_loadu_si128((const __m128i*)(psrc1 + i2));
            v3 = _mm_loadu_si128((const __m128i*)(psrc2 + i0));
            v4 = _mm_loadu_si128((const __m128i*)(psrc2 + i));
            v5 = _mm_loadu_si128((const __m128i*)(psrc2 + i2));
            v6 = _mm_loadu_si128((const __m128i*)(psrc3 + i0));
            v7 = _mm_loadu_si128((const __m128i*)(psrc3 + i));
            v8 = _mm_loadu_si128((const __m128i*)(psrc3 + i2));

            med_op_sse(v1, v2, vtmp);
            med_op_sse(v4, v5, vtmp);
            med_op_sse(v7, v8, vtmp);
            med_op_sse(v0, v1, vtmp);
            med_op_sse(v3, v4, vtmp);
            med_op_sse(v6, v7, vtmp);
            med_op_sse(v1, v2, vtmp);
            med_op_sse(v4, v5, vtmp);
            med_op_sse(v7, v8, vtmp);
            med_op_sse(v0, v3, vtmp);
            med_op_sse(v5, v8, vtmp);
            med_op_sse(v4, v7, vtmp);
            med_op_sse(v3, v6, vtmp);
            med_op_sse(v1, v4, vtmp);
            med_op_sse(v2, v5, vtmp);
            med_op_sse(v4, v7, vtmp);
            med_op_sse(v4, v2, vtmp);
            med_op_sse(v6, v4, vtmp);
            med_op_sse(v4, v2, vtmp);

            _mm_storeu_si128((__m128i*)(pdst + i), v4);
        }

        pdst[i] = psrc2[i];

        psrc1 += width;
        psrc2 += width;
        psrc3 += width;
        pdst += width;
    }

    memcpy(pdst, psrc2, width * sizeof(U16));

#undef MED_VEC_SZ
}

中值滤波使用AVX加速

//  median AVX version
void medianFilt::median3_avx(U16 *src, U16 *dst, S32 width, S32 height)
{
#define MED_VEC_SZ 16

    S32 i0 = 0;
    S32 i = 0;
    S32 i2 = 0;
    S32 j = 0;
    const S32 xend = (width - 2) % MED_VEC_SZ + 1;

    __m256i v0;
    __m256i v1;
    __m256i v2;
    __m256i v3;
    __m256i v4;
    __m256i v5;
    __m256i v6;
    __m256i v7;
    __m256i v8;
    __m256i vtmp;

    U16 *psrc1 = NULL;
    U16 *psrc2 = NULL;
    U16 *psrc3 = NULL;
    U16 *pdst = NULL;

    memcpy(dst, src, width * sizeof(U16));

    psrc1 = src;
    psrc2 = psrc1 + width;
    psrc3 = psrc2 + width;
    pdst = dst + width;

    for (j = 1; j < height - 1; j++)
    {
        pdst[0] = psrc2[0];

        v0 = _mm256_loadu_si256((const __m256i*)(psrc1));
        v1 = _mm256_loadu_si256((const __m256i*)(psrc1 + 1));
        v2 = _mm256_loadu_si256((const __m256i*)(psrc1 + 2));
        v3 = _mm256_loadu_si256((const __m256i*)(psrc2));
        v4 = _mm256_loadu_si256((const __m256i*)(psrc2 + 1));
        v5 = _mm256_loadu_si256((const __m256i*)(psrc2 + 2));
        v6 = _mm256_loadu_si256((const __m256i*)(psrc3));
        v7 = _mm256_loadu_si256((const __m256i*)(psrc3 + 1));
        v8 = _mm256_loadu_si256((const __m256i*)(psrc3 + 2));

        med_op_avx(v1, v2, vtmp);
        med_op_avx(v4, v5, vtmp);
        med_op_avx(v7, v8, vtmp);
        med_op_avx(v0, v1, vtmp);
        med_op_avx(v3, v4, vtmp);
        med_op_avx(v6, v7, vtmp);
        med_op_avx(v1, v2, vtmp);
        med_op_avx(v4, v5, vtmp);
        med_op_avx(v7, v8, vtmp);
        med_op_avx(v0, v3, vtmp);
        med_op_avx(v5, v8, vtmp);
        med_op_avx(v4, v7, vtmp);
        med_op_avx(v3, v6, vtmp);
        med_op_avx(v1, v4, vtmp);
        med_op_avx(v2, v5, vtmp);
        med_op_avx(v4, v7, vtmp);
        med_op_avx(v4, v2, vtmp);
        med_op_avx(v6, v4, vtmp);
        med_op_avx(v4, v2, vtmp);

        _mm256_storeu_si256((__m256i*)(pdst + 1), v4);

        for (i = xend; i < width - 1; i += MED_VEC_SZ)
        {
            i0 = i - 1;
            i2 = i + 1;

            v0 = _mm256_loadu_si256((const __m256i*)(psrc1 + i0));
            v1 = _mm256_loadu_si256((const __m256i*)(psrc1 + i));
            v2 = _mm256_loadu_si256((const __m256i*)(psrc1 + i2));
            v3 = _mm256_loadu_si256((const __m256i*)(psrc2 + i0));
            v4 = _mm256_loadu_si256((const __m256i*)(psrc2 + i));
            v5 = _mm256_loadu_si256((const __m256i*)(psrc2 + i2));
            v6 = _mm256_loadu_si256((const __m256i*)(psrc3 + i0));
            v7 = _mm256_loadu_si256((const __m256i*)(psrc3 + i));
            v8 = _mm256_loadu_si256((const __m256i*)(psrc3 + i2));

            med_op_avx(v1, v2, vtmp);
            med_op_avx(v4, v5, vtmp);
            med_op_avx(v7, v8, vtmp);
            med_op_avx(v0, v1, vtmp);
            med_op_avx(v3, v4, vtmp);
            med_op_avx(v6, v7, vtmp);
            med_op_avx(v1, v2, vtmp);
            med_op_avx(v4, v5, vtmp);
            med_op_avx(v7, v8, vtmp);
            med_op_avx(v0, v3, vtmp);
            med_op_avx(v5, v8, vtmp);
            med_op_avx(v4, v7, vtmp);
            med_op_avx(v3, v6, vtmp);
            med_op_avx(v1, v4, vtmp);
            med_op_avx(v2, v5, vtmp);
            med_op_avx(v4, v7, vtmp);
            med_op_avx(v4, v2, vtmp);
            med_op_avx(v6, v4, vtmp);
            med_op_avx(v4, v2, vtmp);

            _mm256_storeu_si256((__m256i*)(pdst + i), v4);
        }

        pdst[i] = psrc2[i];

        psrc1 += width;
        psrc2 += width;
        psrc3 += width;
        pdst += width;
    }

    memcpy(pdst, psrc2, width * sizeof(U16));

#undef MED_VEC_SZ
}

中值滤波使用AVX + 分行的openmp加速

//  median AVX omp version
void medianFilt::median3_avx_omp(U16 *src, U16 *dst, S32 width, S32 height)
{
#define MED_VEC_SZ 16

    S32 j = 0;
    const S32 xend = (width - 2) % MED_VEC_SZ + 1;

    memcpy(dst, src, width * sizeof(U16));
    memcpy(dst + width * height - width, src + width * height - width, width * sizeof(U16));

#pragma omp parallel for
    for (j = 1; j < height - 1; j++)
    {
        U16 *psrc2 = src + width * j;
        U16 *pdst = dst + width * j;
        U16 *psrc1 = psrc2 - width;
        U16 *psrc3 = psrc2 + width;

        __m256i v0;
        __m256i v1;
        __m256i v2;
        __m256i v3;
        __m256i v4;
        __m256i v5;
        __m256i v6;
        __m256i v7;
        __m256i v8;
        __m256i vtmp;

        S32 i0 = 0;
        S32 i = 0;
        S32 i2 = 0;

        pdst[0] = psrc2[0];

        v0 = _mm256_loadu_si256((const __m256i*)(psrc1));
        v1 = _mm256_loadu_si256((const __m256i*)(psrc1 + 1));
        v2 = _mm256_loadu_si256((const __m256i*)(psrc1 + 2));
        v3 = _mm256_loadu_si256((const __m256i*)(psrc2));
        v4 = _mm256_loadu_si256((const __m256i*)(psrc2 + 1));
        v5 = _mm256_loadu_si256((const __m256i*)(psrc2 + 2));
        v6 = _mm256_loadu_si256((const __m256i*)(psrc3));
        v7 = _mm256_loadu_si256((const __m256i*)(psrc3 + 1));
        v8 = _mm256_loadu_si256((const __m256i*)(psrc3 + 2));

        med_op_avx(v1, v2, vtmp);
        med_op_avx(v4, v5, vtmp);
        med_op_avx(v7, v8, vtmp);
        med_op_avx(v0, v1, vtmp);
        med_op_avx(v3, v4, vtmp);
        med_op_avx(v6, v7, vtmp);
        med_op_avx(v1, v2, vtmp);
        med_op_avx(v4, v5, vtmp);
        med_op_avx(v7, v8, vtmp);
        med_op_avx(v0, v3, vtmp);
        med_op_avx(v5, v8, vtmp);
        med_op_avx(v4, v7, vtmp);
        med_op_avx(v3, v6, vtmp);
        med_op_avx(v1, v4, vtmp);
        med_op_avx(v2, v5, vtmp);
        med_op_avx(v4, v7, vtmp);
        med_op_avx(v4, v2, vtmp);
        med_op_avx(v6, v4, vtmp);
        med_op_avx(v4, v2, vtmp);

        _mm256_storeu_si256((__m256i*)(pdst + 1), v4);

        for (i = xend; i < width - 1; i += MED_VEC_SZ)
        {
            i0 = i - 1;
            i2 = i + 1;

            v0 = _mm256_loadu_si256((const __m256i*)(psrc1 + i0));
            v1 = _mm256_loadu_si256((const __m256i*)(psrc1 + i));
            v2 = _mm256_loadu_si256((const __m256i*)(psrc1 + i2));
            v3 = _mm256_loadu_si256((const __m256i*)(psrc2 + i0));
            v4 = _mm256_loadu_si256((const __m256i*)(psrc2 + i));
            v5 = _mm256_loadu_si256((const __m256i*)(psrc2 + i2));
            v6 = _mm256_loadu_si256((const __m256i*)(psrc3 + i0));
            v7 = _mm256_loadu_si256((const __m256i*)(psrc3 + i));
            v8 = _mm256_loadu_si256((const __m256i*)(psrc3 + i2));

            med_op_avx(v1, v2, vtmp);
            med_op_avx(v4, v5, vtmp);
            med_op_avx(v7, v8, vtmp);
            med_op_avx(v0, v1, vtmp);
            med_op_avx(v3, v4, vtmp);
            med_op_avx(v6, v7, vtmp);
            med_op_avx(v1, v2, vtmp);
            med_op_avx(v4, v5, vtmp);
            med_op_avx(v7, v8, vtmp);
            med_op_avx(v0, v3, vtmp);
            med_op_avx(v5, v8, vtmp);
            med_op_avx(v4, v7, vtmp);
            med_op_avx(v3, v6, vtmp);
            med_op_avx(v1, v4, vtmp);
            med_op_avx(v2, v5, vtmp);
            med_op_avx(v4, v7, vtmp);
            med_op_avx(v4, v2, vtmp);
            med_op_avx(v6, v4, vtmp);
            med_op_avx(v4, v2, vtmp);

            _mm256_storeu_si256((__m256i*)(pdst + i), v4);
        }

        pdst[i] = psrc2[i];

        psrc1 += width;
        psrc2 += width;
        psrc3 += width;
        pdst += width;
    }

#undef MED_VEC_SZ
}

中值滤波多行分块的AVX实现

static void median3_avx_section(U16 *src, U16 *dst, S32 width, S32 height)
{
#define MED_VEC_SZ 16

    S32 i0 = 0;
    S32 i = 0;
    S32 i2 = 0;
    S32 j = 0;
    const S32 xend = (width - 2) % MED_VEC_SZ + 1;

    __m256i v0;
    __m256i v1;
    __m256i v2;
    __m256i v3;
    __m256i v4;
    __m256i v5;
    __m256i v6;
    __m256i v7;
    __m256i v8;
    __m256i vtmp;

    U16 *psrc1 = NULL;
    U16 *psrc2 = NULL;
    U16 *psrc3 = NULL;
    U16 *pdst = NULL;

    psrc1 = src;
    psrc2 = psrc1 + width;
    psrc3 = psrc2 + width;
    pdst = dst + width;

    for (j = 1; j < height - 1; j++)
    {
        pdst[0] = psrc2[0];

        v0 = _mm256_loadu_si256((const __m256i*)(psrc1));
        v1 = _mm256_loadu_si256((const __m256i*)(psrc1 + 1));
        v2 = _mm256_loadu_si256((const __m256i*)(psrc1 + 2));
        v3 = _mm256_loadu_si256((const __m256i*)(psrc2));
        v4 = _mm256_loadu_si256((const __m256i*)(psrc2 + 1));
        v5 = _mm256_loadu_si256((const __m256i*)(psrc2 + 2));
        v6 = _mm256_loadu_si256((const __m256i*)(psrc3));
        v7 = _mm256_loadu_si256((const __m256i*)(psrc3 + 1));
        v8 = _mm256_loadu_si256((const __m256i*)(psrc3 + 2));

        med_op_avx(v1, v2, vtmp);
        med_op_avx(v4, v5, vtmp);
        med_op_avx(v7, v8, vtmp);
        med_op_avx(v0, v1, vtmp);
        med_op_avx(v3, v4, vtmp);
        med_op_avx(v6, v7, vtmp);
        med_op_avx(v1, v2, vtmp);
        med_op_avx(v4, v5, vtmp);
        med_op_avx(v7, v8, vtmp);
        med_op_avx(v0, v3, vtmp);
        med_op_avx(v5, v8, vtmp);
        med_op_avx(v4, v7, vtmp);
        med_op_avx(v3, v6, vtmp);
        med_op_avx(v1, v4, vtmp);
        med_op_avx(v2, v5, vtmp);
        med_op_avx(v4, v7, vtmp);
        med_op_avx(v4, v2, vtmp);
        med_op_avx(v6, v4, vtmp);
        med_op_avx(v4, v2, vtmp);

        _mm256_storeu_si256((__m256i*)(pdst + 1), v4);

        for (i = xend; i < width - 1; i += MED_VEC_SZ)
        {
            i0 = i - 1;
            i2 = i + 1;

            v0 = _mm256_loadu_si256((const __m256i*)(psrc1 + i0));
            v1 = _mm256_loadu_si256((const __m256i*)(psrc1 + i));
            v2 = _mm256_loadu_si256((const __m256i*)(psrc1 + i2));
            v3 = _mm256_loadu_si256((const __m256i*)(psrc2 + i0));
            v4 = _mm256_loadu_si256((const __m256i*)(psrc2 + i));
            v5 = _mm256_loadu_si256((const __m256i*)(psrc2 + i2));
            v6 = _mm256_loadu_si256((const __m256i*)(psrc3 + i0));
            v7 = _mm256_loadu_si256((const __m256i*)(psrc3 + i));
            v8 = _mm256_loadu_si256((const __m256i*)(psrc3 + i2));

            med_op_avx(v1, v2, vtmp);
            med_op_avx(v4, v5, vtmp);
            med_op_avx(v7, v8, vtmp);
            med_op_avx(v0, v1, vtmp);
            med_op_avx(v3, v4, vtmp);
            med_op_avx(v6, v7, vtmp);
            med_op_avx(v1, v2, vtmp);
            med_op_avx(v4, v5, vtmp);
            med_op_avx(v7, v8, vtmp);
            med_op_avx(v0, v3, vtmp);
            med_op_avx(v5, v8, vtmp);
            med_op_avx(v4, v7, vtmp);
            med_op_avx(v3, v6, vtmp);
            med_op_avx(v1, v4, vtmp);
            med_op_avx(v2, v5, vtmp);
            med_op_avx(v4, v7, vtmp);
            med_op_avx(v4, v2, vtmp);
            med_op_avx(v6, v4, vtmp);
            med_op_avx(v4, v2, vtmp);

            _mm256_storeu_si256((__m256i*)(pdst + i), v4);
        }

        pdst[i] = psrc2[i];

        psrc1 += width;
        psrc2 += width;
        psrc3 += width;
        pdst += width;
    }
#undef MED_VEC_SZ
}

中值滤波AVX + 分四块openmp的算法加速 (这里是针对290列的图像写的偏移72和74)

void medianFilt::median3_avx_omp4(U16 *src, U16 *dst, S32 width, S32 height)
{
    //     const S32 CPU_CORE_NUM = 4;

    S32 j = 0;
    S32 offset = 0;

    memcpy(dst, src, width * sizeof(U16));
    memcpy(dst + width * height - width, src + width * height - width, width * sizeof(U16));

#pragma omp parallel for
    for (j = 0; j < 4; j++)
    {
        offset = j * width * 72;
        median3_avx_section(src + offset, dst + offset, width, 74);
    }
}


使用了opencv读图并计算时间消耗

void median1_testP()
{
    Mat src, dst_, dst, delta;

    src = imread("D:/d.pgm", -1);

    medianBlur(src, dst, 3);

    dst_ = src.clone();

    U16 *psrc = (U16 *)src.data;
    U16 *pdst = (U16 *)dst_.data;

    double t;

    t = (double)getTickCount();
    for (int i = 0; i < CYC_NUM; i++)
    {
        medianFilt::median3(psrc, pdst, src.cols, src.rows);
    }
    t = ((double)getTickCount() - t) * 1000 / CYC_NUM / getTickFrequency();
    printf("%.3fms\n", t);
    
    t = (double)getTickCount();
    for (int i = 0; i < CYC_NUM; i++)
    {
        medianFilt::median3_omp(psrc, pdst, src.cols, src.rows);
    }
    t = ((double)getTickCount() - t) * 1000 / CYC_NUM / getTickFrequency();
    printf("omp %.3fms\n", t);

    t = (double)getTickCount();
    for (int i = 0; i < CYC_NUM; i++)
    {
        medianFilt::median3_omp4(psrc, pdst, src.cols, src.rows);
    }
    t = ((double)getTickCount() - t) * 1000 / CYC_NUM / getTickFrequency();
    printf("omp4 %.3fms\n", t);

    t = (double)getTickCount();
    for (int i = 0; i < CYC_NUM; i++)
    {
        medianFilt::median3_4thd(psrc, pdst, src.cols, src.rows);
    }
    t = ((double)getTickCount() - t) * 1000 / CYC_NUM / getTickFrequency();
    printf("4thd %.3fms\n", t);

    t = (double)getTickCount();
    for (int i = 0; i < CYC_NUM; i++)
    {
        medianFilt::median3_sse(psrc, pdst, src.cols, src.rows);
    }
    t = ((double)getTickCount() - t) * 1000 / CYC_NUM / getTickFrequency();
    printf("sse %.3fms\n", t);

    t = (double)getTickCount();
    for (int i = 0; i < CYC_NUM; i++)
    {
        medianFilt::median3_avx(psrc, pdst, src.cols, src.rows);
    }
    t = ((double)getTickCount() - t) * 1000 / CYC_NUM / getTickFrequency();
    printf("avx %.3fms\n", t);

    t = (double)getTickCount();
    for (int i = 0; i < CYC_NUM; i++)
    {
        medianFilt::median3_avx_omp(psrc, pdst, src.cols, src.rows);
    }
    t = ((double)getTickCount() - t) * 1000 / CYC_NUM / getTickFrequency();
    printf("avx omp %.3fms\n", t);

    t = (double)getTickCount();
    for (int i = 0; i < CYC_NUM; i++)
    {
        medianFilt::median3_avx_omp4(psrc, pdst, src.cols, src.rows);
    }
    t = ((double)getTickCount() - t) * 1000 / CYC_NUM / getTickFrequency();
    printf("avx omp4 %.3fms\n", t);

    Rect roi(1, 1, 498, 288);
    delta = (dst_ - dst)(roi);
}

#if NEON_ENABLE
//  median NEON version
void medianFilt::median3_neon(U16 *src, U16 *dst, S32 width, S32 height)
{
#define MED_VEC_SZ 8

    S32 i0 = 0;
    S32 i = 0;
    S32 i2 = 0;
    S32 j = 0;
    const S32 xend = (width - 2) % MED_VEC_SZ + 1;

    uint16x8_t v0;
    uint16x8_t v1;
    uint16x8_t v2;
    uint16x8_t v3;
    uint16x8_t v4;
    uint16x8_t v5;
    uint16x8_t v6;
    uint16x8_t v7;
    uint16x8_t v8;
    uint16x8_t vtmp;

    U16 *psrc1 = NULL;
    U16 *psrc2 = NULL;
    U16 *psrc3 = NULL;
    U16 *pdst = NULL;

    memcpy(dst, src, width * sizeof(U16));

    psrc1 = src;
    psrc2 = psrc1 + width;
    psrc3 = psrc2 + width;
    pdst = dst + width;

    for (j = 1; j < height - 1; j++)
    {
        pdst[0] = psrc2[0];

        v0 = vld1q_u16(psrc1);
        v1 = vld1q_u16(psrc1 + 1);
        v2 = vld1q_u16(psrc1 + 2);
        v3 = vld1q_u16(psrc2);
        v4 = vld1q_u16(psrc2 + 1);
        v5 = vld1q_u16(psrc2 + 2);
        v6 = vld1q_u16(psrc3);
        v7 = vld1q_u16(psrc3 + 1);
        v8 = vld1q_u16(psrc3 + 2);

        med_op_neon(v1, v2, vtmp);
        med_op_neon(v4, v5, vtmp);
        med_op_neon(v7, v8, vtmp);
        med_op_neon(v0, v1, vtmp);
        med_op_neon(v3, v4, vtmp);
        med_op_neon(v6, v7, vtmp);
        med_op_neon(v1, v2, vtmp);
        med_op_neon(v4, v5, vtmp);
        med_op_neon(v7, v8, vtmp);
        med_op_neon(v0, v3, vtmp);
        med_op_neon(v5, v8, vtmp);
        med_op_neon(v4, v7, vtmp);
        med_op_neon(v3, v6, vtmp);
        med_op_neon(v1, v4, vtmp);
        med_op_neon(v2, v5, vtmp);
        med_op_neon(v4, v7, vtmp);
        med_op_neon(v4, v2, vtmp);
        med_op_neon(v6, v4, vtmp);
        med_op_neon(v4, v2, vtmp);

        vst1q_u16((pdst + 1), v4);

        for (i = xend; i < width - 1; i += MED_VEC_SZ)
        {
            i0 = i - 1;
            i2 = i + 1;

            v0 = vld1q_u16(psrc1 + i0);
            v1 = vld1q_u16(psrc1 + i);
            v2 = vld1q_u16(psrc1 + i2);
            v3 = vld1q_u16(psrc2 + i0);
            v4 = vld1q_u16(psrc2 + i);
            v5 = vld1q_u16(psrc2 + i2);
            v6 = vld1q_u16(psrc3 + i0);
            v7 = vld1q_u16(psrc3 + i);
            v8 = vld1q_u16(psrc3 + i2);

            med_op_neon(v1, v2, vtmp);
            med_op_neon(v4, v5, vtmp);
            med_op_neon(v7, v8, vtmp);
            med_op_neon(v0, v1, vtmp);
            med_op_neon(v3, v4, vtmp);
            med_op_neon(v6, v7, vtmp);
            med_op_neon(v1, v2, vtmp);
            med_op_neon(v4, v5, vtmp);
            med_op_neon(v7, v8, vtmp);
            med_op_neon(v0, v3, vtmp);
            med_op_neon(v5, v8, vtmp);
            med_op_neon(v4, v7, vtmp);
            med_op_neon(v3, v6, vtmp);
            med_op_neon(v1, v4, vtmp);
            med_op_neon(v2, v5, vtmp);
            med_op_neon(v4, v7, vtmp);
            med_op_neon(v4, v2, vtmp);
            med_op_neon(v6, v4, vtmp);
            med_op_neon(v4, v2, vtmp);

            vst1q_u16((pdst + i), v4);
        }

        pdst[i] = psrc2[i];

        psrc1 += width;
        psrc2 += width;
        psrc3 += width;
        pdst += width;
    }

    memcpy(pdst, psrc2, width * sizeof(U16));

#undef MED_VEC_SZ
}
#endif


现在使用测试代码测试速度(我的平台是4核CPU)

1 core : 0.157ms
omp : 0.043ms
omp 4 sections : 0.043ms
4threads : 2.919ms
sse : 0.154ms
avx : 0.095ms
avx omp : 0.026ms
avx omp 4 sections : 0.028ms


总结:

1.1 快速3X3中值滤波本身的计算是元素比较和排序,

属于控制密集型,而非运算密集型

所以带宽的耗时比计算的耗时明显

1.2 由于算法需要,数据IO使用的是loadu和storeu,而不是load和store。读写访问速度要慢一些。

2. 单核版本和SSE加速版本耗时几乎一样,这里是因为VS编译器的优化使得快速3X3中值滤波速度很快

计算速度瓶颈主要在内存访问

3. omp加速版本相比单核的快了3.65倍,接近4倍,和四核加速比较符合

4. omp分行加速和多行分块加速是几乎一样的,可见这里的额外开销不大

5. avx加速是sse加速的1.62倍,接近2倍,符合128位数据和256位数据特性

6. SIMD + multiThread : 这里测试了avx + omp

    6.1 在avx的基础上增加omp多线程确实能增加3.65倍速度,接近4倍,和四核加速比较符合

    6.2 avx + omp分行加速和多行分块加速是几乎一样的,可见这里的额外开销不大

7. std::thread的额外开销好像很大,分块并行反而更慢了,近20倍

8* neon版本在文末,暂时没有测试过

阅读更多

更多精彩内容