GPU is slower than CPU with OpenCL and OpenCV

Hi Guys,

I tested a 2560×1440 image over 100 iterations; my test code is below. The CPU execution time is around 30 ms per iteration and the GPU execution time is around 120 ms per iteration.
The system I used is lebian_9 with prebuilt TensorFlow. I found libOpenCL.so in /usr/lib/aarch64-linux-gnu and the include files in /usr/include/aarch64-linux-gnu/CL. These are the options I used to build OpenCV 3.4.3:

 -D WITH_OPENCL=OFF
 -D HAVE_OPENCL_STATIC=ON
 -D OPENCL_LIBRARIES=/usr/lib/aarch64-linux-gnu/libOpenCL.so
 -D OPENCL_INCLUDE_DIRS=/usr/include/aarch64-linux-gnu/CL

I am wondering whether the library and header files I used are the correct ones for the Mali G72 GPU.
Does anyone have an explanation for why the GPU is slower than the CPU?
Thanks for your help.
Théo

#include "opencv2/opencv.hpp"
#include "opencv2/core/ocl.hpp"
#include <iostream>
#include <stdio.h>

using namespace cv;
using namespace std;

int main(int argc, char** argv)  
{   
    ocl::setUseOpenCL(true);
    if (ocl::haveOpenCL())
    {
        cout << "OpenCL is available..." << endl;
        //return;
    }

cv::ocl::Context context;
if (!context.create(cv::ocl::Device::TYPE_GPU))
{
    //cout << "Failed creating the context..." << endl;
    //return;
} 
cout << context.ndevices() << " GPU devices are detected." << endl;
for (int i = 0; i < context.ndevices(); i++)
{
    cv::ocl::Device device = context.device(i);
    cout << "name                 : " << device.name() << endl;
    cout << "available            : " << device.available() << endl;
    cout << "imageSupport         : " << device.imageSupport() << endl;
    cout << "OpenCL_C_Version     : " << device.OpenCL_C_Version() << endl;
    cout << endl;
}

UMat img, gray;
imread("image_2560.jpg", IMREAD_COLOR).copyTo(img);
//img = imread("image_2560.jpg", 1);
int64 t=getTickCount();
for(int i=0; i<100; i++)
{   
    int64 t1=getTickCount();
cvtColor(img, gray, COLOR_BGR2GRAY);
GaussianBlur(gray, gray, Size(7, 7), 1.5);
Canny(gray, gray, 0, 50);
    t1 = getTickCount() - t1;
    printf("Time elapsed t1: %fms\n", t1*1000/getTickFrequency());
}
t = getTickCount() - t;
printf("Time elapsed t: %fms\n", t*1000/getTickFrequency());
return 0;

}