/*
 * This confidential and proprietary software may be used only as
 * authorised by a licensing agreement from Arm Limited.
 *    Copyright 2016-2020 Arm Ltd. All Rights Reserved.
 * The entire notice above must be reproduced on all authorised
 * copies and copies may only be made to the extent permitted
 * by a licensing agreement from Arm Limited.
 */

//pi as a single-precision floating point literal
#define M_PIf 3.14159265358979f

/**
\brief Loads two adjacent packed RGB888 pixels and widens their channels to short

Exactly six bytes are read, so this is safe to use where an eight byte vector load
would run past the last pixel of a row.

\param[in] p Pointer to the first byte of the first pixel
\return The six channel values in lanes 0-5; lanes 6 and 7 are zero
*/
short8 sobel_load_rgb_pair(__global const uchar *p)
{
    return convert_short8((uchar8)(vload4(0, p), vload2(0, p + 4), (uchar2)(0)));
}

/**
\brief This kernel performs a sobel convolution on the input buffer and outputs the edge gradient magnitudes in a buffer

Each work-item produces two horizontally adjacent output pixels, (x, y) and (x + 1, y), with
x = 2 * get_global_id(0) and y = get_global_id(1). For every colour channel the horizontal and
vertical 3x3 sobel responses are combined as sqrt(gx^2 + gy^2); the three channel magnitudes of
a pixel are then combined into one value by taking their euclidean length, clamped to [0, 255].

A pixel pair is written as 0 when the 3x3 neighbourhood of either of its pixels is not fully
inside the image. When width is odd the last column has no partner pixel and is written as 0.

\param[in] input The input buffer in packed RGB888 format
\param[out] sobel_out The edge gradient magnitudes, one float per pixel, indexed as y * width + x
\param[in] width The input (and output) image width
\param[in] height The input (and output) image height
\param[in] stride The input row stride in bytes
*/
__kernel void convolve_sobel(__global uchar *input, __global float *sobel_out, int width, int height, int stride)
{
    //multiplied by 2 because each work-item handles a pair of pixels
    const int x = get_global_id(0) * 2;
    const int y = get_global_id(1);

    //work-items that start outside the image have nothing to write
    if(x >= width || y >= height)
        return;

    __global float *dst = &sobel_out[y * width + x];

    //odd width: only the first pixel of the pair exists and it lies on the right hand border.
    //a two element store would overrun the row, so blank just this one pixel
    if(x + 1 >= width)
    {
        dst[0] = 0.0f;
        return;
    }

    //if the loads below would need pixels outside the image, write a blank
    if(y - 1 < 0 || x - 1 < 0 || x + 2 >= width || y + 1 >= height)
    {
        vstore2((float2)(0.0f), 0, dst);
        return;
    }

    __global const uchar *location = &input[y * stride + x * 3];

    //lanes 0-5 of each vector hold the r,g,b channels of two pixels; lanes 6 and 7 are never used.
    //l1 and md are loaded eight bytes at a time: the two extra bytes belong to pixels x + 1 and
    //x + 2, which the border check above guarantees exist (assumes stride >= 3 * width).
    //r1 starts one pixel further right, where the two extra bytes would belong to pixel x + 3
    //and may lie beyond the end of the row (or of the buffer), so it is loaded as six bytes only
    short8 l1, md, r1;

    int8 sumx, sumy;

    //perform first row of sobel convolution
    l1 = convert_short8(vload8(0, location - stride - 3));
    md = convert_short8(vload8(0, location - stride));
    r1 = sobel_load_rgb_pair(location - stride + 3);

    sumx = convert_int8(-l1 + r1);
    sumy = convert_int8(l1 + (short)2 * md + r1);

    //perform second row of sobel convolution. md is not needed as the middle row of the sobel_y
    //filter and the middle column of the sobel_x filter are both 0
    l1 = convert_short8(vload8(0, location - 3));
    r1 = sobel_load_rgb_pair(location + 3);

    sumx += convert_int8((short)2 * (-l1 + r1));

    //perform third row of sobel convolution
    l1 = convert_short8(vload8(0, location + stride - 3));
    md = convert_short8(vload8(0, location + stride));
    r1 = sobel_load_rgb_pair(location + stride + 3);

    sumx += convert_int8(-l1 + r1);
    sumy += convert_int8(-l1 + (short)(-2) * md - r1);

    //work out the gradient magnitude of every channel
    float8 grads = native_sqrt(convert_float8(sumx * sumx + sumy * sumy));

    //combine the three channel magnitudes of each pixel into a single gradient value
    float2 out = (float2)(fast_length(grads.s012), fast_length(grads.s345));

    //keep the result within the 8 bit intensity range
    out = clamp(out, 0.0f, 255.0f);

    //store out gradients
    vstore2(out, 0, dst);
}
