OpenCL C Built-IN Functions

最新推荐文章于 2023-12-09 22:55:16 发布
原创最新推荐文章于 2023-12-09 22:55:16 发布 · 4.8k 阅读
2 ·
CC 4.0 BY-SA版权
GPU 专栏收录该内容
4 篇文章
订阅专栏
//
// OpenCL C Built-IN Functions
//

//  Work-Item Functions
uint get_work_dim();
size_t get_global_size(uint dimindex);
size_t get_global_id(uint dimindex);
size_t get_local_size(uint dimindex);
size_t get_local_id(uint dimindex);
size_t get_num_groups(uint dimindex);
size_t get_group_id(uint dimindex);
size_t get_global_offset(uint dimindex);




// Math Functions
/*

*/
#define M_E        2.71828182845904523536
#define M_LOG2E    1.44269504088896340736
#define M_LOG10E   0.434294481903251827651
#define M_LN2      0.693147180559945309417
#define M_LN10     2.30258509299404568402
#define M_PI       3.14159265358979323846
#define M_PI_2     1.57079632679489661923
#define M_PI_4     0.785398163397448309616
#define M_1_PI     0.318309886183790671538
#define M_2_PI     0.636619772367581343076
#define M_2_SQRTPI 1.12837916709551257390
#define M_SQRT2    1.41421356237309504880
#define M_SQRT1_2  0.707106781186547524401


gentype acos(gentype x);             //  Compute the arc cosine of x.
gentype acosh(gentype x);           // Compute the inverse hyperbolic cosine of x.
gentype acospi(gentype x);          // Compute acos(x) / p.
gentype asin(gentype x);            // Compute the arc sine of x.
gentype asinh(gentype x);           // Compute the inverse hyperbolic sine of x.
gentype asinpi(gentype x);          // Compute asin(x) / p.
gentype atan(gentype y_over_x);    // Compute the arc tangent of y_over_x.
gentype atan2(gentype y, gentype x);    // Compute the arc tangent of y / x.
gentype atanh(gentype x);               // Compute the hyperbolic arc tangent of x.
gentype atanpi(gentype x);              // Compute atan(x) / p.
gentype atan2pi(gentype y, gentype x);   // Compute atan2(y, c) / p.
gentype cbrt(gentype x);                // Compute the cube root of x.
gentype ceil(gentype x);                // Round to an integral value using the round - to - positive - infinity rounding mode.
gentype copysign(gentype x, gentype y);   //Returns xwith its sign changed to match the sign of y.
gentype cos(gentype x);                   // Compute the cosine of x.
gentype cosh(gentype x);                // Compute the hyperbolic cosine of x.
gentype cospi(gentype x);               // Compute cos(px)
gentype erfc(gentype x);            // Compute the complementary error function 1.0 – erf(x).
gentype erf(gentype x);             // Compute the error function.For argument xthis is defined as
gentype exp(gentype x);             // Compute the base - e exponential of x.
gentype exp2(gentype x);            // Compute the base - 2 exponential of x.
gentype exp10(gentype x);           // Compute the base - 10 exponential of x.
gentype expm1(gentype x);           // Compute e^x - 1.0
gentype fabs(gentype x);            // Compute the absolute value of a floating - point number.
gentype fdim(gentype x, gentype y);  // Returns x– yif x > y, +0if xis less than or equal to y.
gentype floor(gentype x);           // Round to an integral value using the round - to - negative - infinity rounding mode.
gentype fma(gentype a, gentype b, gentype c);      /* Returns the correctly rounded floating - point representation of the
                                                    sum of cwith the infinitely precise product of aand b.Rounding of
                                                    intermediate products does not occur.Edge case behavior is per the
                                                    IEEE 754 - 2008 standard. */
gentype fmax(gentype x, gentype y);
gentypef fmax(gentypef x, float y);
gentyped fmax(gentyped x, double y);  /* Returns yif x < y; otherwise it returns x.If one argument is a NaN,
                                        fmax()returns the other argument.If both arguments are NaNs,
                                        fmax()returns a NaN. */

gentype fmin(gentype x, gentype y);
gentypef fmin(gentypef x, float y);
gentyped fmin(gentyped x, double y); /*
Returns yif y < x; otherwise it returns x.If one argument is a NaN,
fmin()returns the other argument.If both arguments are NaNs,
fmin()returns a NaN.  */
gentype fmod(gentype x, gentype y);  // Returns x– y* trunc(x / y).
gentype fract(gentype x, global gentype *iptr);
gentype fract(gentype x, local gentype *iptr);
gentype fract(gentype x, private gentype *iptr);    // Returns fmin(x– floor(x), 0x1.fffffep - 1f).floor(x)is returned in iptr.
gentype frexp(gentype x, global intn*exp);
gentype frexp(gentype x, local intn *exp);
gentype frexp(gentype x, private intn*exp);         /*
                                                    Extract mantissa and exponent from x.For each component the
                                                    mantissa returned is a float with magnitude in the interval[1 / 2, 1)
                                                    or0.Each component of xequals mantissa returned * 2
                                                    */
gentype hypot(gentype x, gentype y);                  // Compute the value of the square root of x
gentype ldexp(gentype x, intn exp);
gentype ldexp(gentype x, int exp);   // Returns x * 2^exp
gentype lgamma(gentype x);
gentype lgamma_r(gentype x, global intn*signp);
gentype lgamma_r(gentype x, local intn*signp);
gentype lgamma_r(gentype x, private intn*signp);     // Compute the log gamma function given by

gentype log(gentypex);                  // Compute the natural logarithm of x.
gentype log2(gentypex);                 // Compute the base - 2 logarithm of x.
gentype log10(gentypex);                // Compute the base - 10 logarithm of x.
gentype log1p(gentypex);                // Compute loge(1.0 + x).
gentype logb(gentypex);                 // Compute the exponent of x, which is the integral part of logr | x | .
gentypemad(gentype a, gentype b, gentype c);  /* madapproximates a* b + c.Whether or how the product of a* b
                                                is rounded and how supernormal or subnormal intermediate products are handled are not defined.madis intended to be used where
                                                speed is preferred over accuracy. */
gentype maxmag(gentype x, gentype y); // Returns xif | x | > | y | , yif | y | > | x | , otherwise fmax(x, y).
gentype minmag(gentype x, gentype y); // Returns xif | x | < | y | , yif | y | < | x | , otherwise fmin(x, y).
gentype modf(gentype x,global gentype *iptr)
gentype modf(gentype x,local gentype *iptr)
gentype modf(gentype x, private gentype *iptr) /*
                                                Decompose a floating - point number.The modffunction breaks the
                                                argument xinto integral and fractional parts, each of which has the
                                                same sign as the argument.It stores the integral part in the object
                                                pointed to by iptrand returns the fractional part. 
                                                */
float nan(uintn ancode);
floatn nan(uintn nancode);
double nan(uint nancode);
doublen nan(uintn nancode);  /*
                                Returns a quiet NaN.The nancodemay be placed in the significand
                                of the resulting NaN.
                                */
gentype nextafter(gentype x, gentype y);  /*
                            Compute the next representable single - or double - precision floatingpoint value following xin the direction of y.Thus, if yis less than x,
                            nextafterreturns the largest representable floating - point number
                            less than x
                            */

gentype pow(gentype x, gentype y);      // Compute xto the power y.
gentype pown(gentype x, intn y);        // Compute xto the power y, where yis an integer.
gentype powr(gentype x, gentype y);     // Compute xto the power y, where x >= 0.
gentype remainder(gentype x, gentype y); /*
                                        Compute the value rsuch that r = x– n* y, where nis the
                                        integer nearest the exact value of x / y.If there are two integers closest
                                        to x / y, nwill be the even one.If ris zero, it is given the same sign
                                        as x. 
                                        */
gentype remquo(gentype x, gentype y, global gentypei *quo);
gentype remquo(gentype x, gentype y, local gentypei *quo);
gentype remquo(gentype x, gentype y, private gentypei *quo); /*
                                        Compute the value rsuch that r = x– n* y, where nis the
                                        integer nearest the exact value of x / y.If there are two integers closest
                                        to x / y, nwill be the even one.If ris zero, it is given the same sign
                                        as x.
                                        This is the same value that is returned by the remainder function.
                                        remquoalso calculates the lower seven bits of the integral quotient
                                        x / yand gives that value the same sign as x / y.It stores this signed
                                        value in the object pointed to by quo.
                                        */

gentype rint(gentype x);            // Round to integral value(using round - to - nearest rounding mode) in floating - point format.
gentype rootn(gentype x, intn y);   // Compute xto the power 1 / y.
gentype round(gentype x);           //  Return the integral value nearest to x, rounding halfway cases away from zero, regardless of the current rounding direction.
gentype rsqrt(gentype x);           // Compute the inverse square root of x.
gentype sin(gentype x);             // Compute the sine of x

gentype sincos(gentype x, global gentype *cosval);
gentype sincos(gentype x, local gentype *cosval);
gentype sincos(gentype x, private gentype *cosval);    /*
                                                        Compute the sine and cosine of x.The computed sine is the return
                                                        value and the computed cosine is returned in cosval. */
gentype sinh(gentype x);         // Compute the hyperbolic sine of x.
gentype sinpi(gentype x);        // Compute sin(px).
gentype sqrt(gentype x);        // Compute the square root of x.
gentype tan(gentype x);         // Compute the tangent of x.
gentype tanh(gentype x);        // Compute the hyperbolic tangent of x.
gentype tanpi(gentype x);       // Compute tan(px).
gentype tgamma(gentype x);      // Compute the gamma function.
gentype trunc(gentype x);       // Round to integral value using the round - to - zero rounding mode

gentypef half_cos(gentypef x);  // Compute the cosine of x.xmust be in the range - 2
gentypef half_divide(gentypef x, gentypef y)  //Compute x / y.
gentypef half_exp(gentypef x);      // Compute the base - e exponential of x.
gentypef half_exp2(gentypef x);     // Compute the base - 2 exponential of x.
gentypef half_exp10(gentypef x);    // Compute the base - 10 exponential of x.
gentypef half_log(gentypef x);      // Compute the natural logarithm of x.
gentypef half_log2(gentypef x);     // Compute the base - 2 logarithm of x.
gentypef half_log10(gentypef x);    // Compute the base - 10 logarithm of x.
gentypef half_powr(gentypef x, gentypef y);   // Compute xto the power y, where x >= 0.
gentypef half_recip(gentypef x);    // Compute the reciprocal of x.
gentypef half_rsqrt(gentypef x);    // Compute the inverse square root of x.
gentypef half_sin(gentypef x);      //  Compute the sine of x.xmust be in the range - 2
gentypef half_sqrt(gentypef x);     // Compute the square root of x.
gentypef half_tan(gentypef x);      // Compute the tangent of x.xmust be in the range - 2
gentypef native_cos(gentypef x);    /* Compute the cosine of xover an implementation - defined range.The maximum error is implementation - defined.*/
gentypef native_divide(gentypef x, gentypef y);   // Compute x / yover an implementation - defined range.The maximum error is implementation - defined.

gentypef native_exp(gentypef x);    // Compute the base - e exponential of xover an implementation - defined range.The maximum error is implementation - defined.
gentypef native_exp2(gentypef x);   // Compute the base - 2 exponential of xover an implementation - defined range.The maximum error is implementation - defined.
gentypef native_exp10(gentypef x);  // Compute the base - 10 exponential of xover an implementation - defined range.The maximum error is implementation - defined.
gentypef native_log(gentypef x);    // Compute the natural logarithm of xover an implementation - defined range.The maximum error is implementation - defined.
gentypef native_log2(gentypef x);   // Compute the base - 2 logarithm of xover an implementation - defined range.The maximum error is implementation - defined.
gentypef native_log10(gentypef x);  // Compute the base - 10 logarithm of xover an implementation - defined range.The maximum error is implementation - defined.
gentypef native_recip(gentypef x);  // Compute the reciprocal of xover an implementation - defined range. The maximum error is implementation - defined.
gentypef native_rsqrt(gentypef x);  // Compute the inverse square root of xover an implementation - defined range.The maximum error is implementation - defined.
gentypef native_sin(gentypef x);    // Compute the sine of xover an implementation - defined range.The maximum error is implementation - defined.
gentypef native_sqrt(gentypef x);   // Compute the square root of xover an implementation - defined range.The maximum error is implementation - defined.
gentypef native_tan(gentypef x);    // Compute the tangent of xover an implementation - defined range.Themaximum error is implementation - defined.




// Integer Functions

ugentype abs(gentype x); // Returns | x | .
ugentype abs_diff(gentype x, gentype y);  // Returns | x– y | without modulo overflow.
gentype add_sat(gentype x, gentype y);   // Returns x + yand saturates the result.
gentype hadd(gentype x, gentype y);  // Returns(x + y) >> 1. The intermediate sum does not modulo overflow.
gentype rhadd(gentype x, gentype y);  // Returns(x + y + 1) >> 1. The intermediate sum does not modulo overflow.
gentype clamp(gentype x, gentype minval, gentype maxval);  // Returns min(max(x, minval), maxval).
gentype clamp(gentype x, sgentype minval, sgentype maxval);  //Results are undefined if minval > maxval.
gentype clz(gentype x);      //  Returns the number of leading 0bits in x, starting at the most significant bit position.
gentype mad_hi(gentype a, gentype b, gentype c);
gentype mad_sat(gentype a, gentype b, gentype c);   // Returns mul_hi(a, b) + c.
gentype max(gentype x, gentype y)  // Returns a* b + cand saturates the result.
gentype max(gentype x, sgentype y)
gentype max(gentype x, gentype y);  // Returns yif x < y; otherwise it returns x.
gentype max(gentype x, sgentype y);

gentype mul_hi(gentypex, gentype y); // Computes x* yand returns the high half of the product of x and y.
gentype rotate(gentypev, gentype i);  /* For each element in v, the bits are shifted left by the number of
                                        bits given by the corresponding element in i(subject to the
                                        usual shift modulo rules described in the “Shift Operators”
                                        subsection of “Vector Operators” in Chapter 4).Bits shifted off
                                        the left side of the element are shifted back in from the right.
                                        */
gentype sub_sat(gentype x, gentype y);   // Returns x - y and saturates the result.
short upsample(char hi, uchar lo);
ushort upsample(uchar hi, uchar lo);
shortn upsample(charn hi, ucharn lo);
ushortn upsample(ucharn hi, ucharn lo);
int upsample(short hi, ushort lo);
uint upsample(ushort hi, ushort lo);
intn upsample(shortn hi, ushortn lo);
uintn upsample(ushortn hi, ushortn lo);
long upsample(int hi, uint lo);
ulong upsample(uint hi, uint lo);
longn upsample(intn hi, uintn lo);
ulongn upsample(uintn hi, uintn lo);
/*
If hiand loare scalar :
result = ((short)hi << 8) | lo
result = ((ushort)hi << 8) | lo
result = ((int)hi << 16) | lo
result = ((uint)hi << 16) | lo
result = ((long)hi << 32) | lo
result = ((ulong)hi << 32) | lo
If hiand loare scalar, then for each element of the vector :
result[i] = ((short)hi[i] << 8) | lo[i]
result[i] = ((ushort)hi[i] << 8) | lo[i]
result[i] = ((int)hi[i] << 16) | lo[i]
result[i] = ((uint)hi[i] << 16) | lo[i]
result[i] = ((long)hi[i] << 32) | lo[i]
result[i] = ((ulong)hi[i] << 32) | lo[i]
*/

gentype mad24(gentype x, gentype y, gentype z);  /* Multiply two 24 - bit integer values xand yusing mul24and add
                                                the 32 - bit integer result to the 32 - bit integer z.
                                                gentypemul24(gentypex, gentype y) Multiply two 24 - bit integer values xand y.xand yare 32 - bit
                                                integers but only the low 24 bits are used to perform the
                                                multiplication.mul24should be used only when values in x
                                                and yare in the range[-2^23, 2^23 - 1]
                                                if xand yare signed integers and in the range[0, 2^24 - 1]
                                                if xand yare unsigned integers.If xand yare not in this range, the multiplication
                                                result is implementation - defined.
                                                */






// Common Functions
gentype clamp(gentype x, gentype minval, gentype maxval);
gentypef clamp(gentypef x, float minval, float maxval);
gentyped clamp(gentyped x, double minval, double maxval); /*
    Returns fmin(fmax(x, minval), maxval).
    Results are undefined if minval > maxval. */

gentype degrees(gentype radians);  // Converts radiansto degrees; i.e., (180 / p) * radians.
gentype max(gentype x, gentype y);
gentypef max(gentypef x, float y);
gentyped max(gentyped x, double y);   /*
    Returns yif x < y; otherwise it returns x.This is similar to fmax
    described in Table 5.2 except that if xor yis infiniteor NaN, the
    return values are undefined. */
gentype min(gentype x, gentype y);
gentypef min(gentypef x, float y);
gentyped min(gentyped x, double y);   /*
    Returns yif y < x; otherwise it returns x.This is similar to fmin
    described in Table 5.2 except that if xor yis infiniteorNaN, the
    return values are undefined. */
gentype mix(gentype x, gentype y, gentype a);
gentypef mix(gentypef x, float y, gentype a);
gentyped mix(gentyped x, double y, gentype a); /*
    Returns the linear blend of xand yimplemented as
    x + (y– x) * a
    amust be a value in the range 0.0 … 1.0.If ais not in this range,
    the return values are undefined.
    */
gentype radians(gentype degrees); // Converts degreesto radians; i.e., (p / 180) * degrees.


gentype step(gentype edge, gentype x);
gentypef step(float edge, gentypef x);
gentyped step(double edge, gentyped x); /*
    Returns 0.0if x < edge; otherwise it returns 1.0.The step
    function can be used to create a discontinuous jump at an arbitrary point. */
gentype smoothstep(gentype edge0, gentype edge1, gentype x);
gentypef smoothstep(float edge0, float edge1, gentypef x);
gentyped smoothstep(double edge0, double edge1, gentyped x);
/*
    Returns 0.0if x <= edge0and 1.0if x >= edge1and performs a
    smooth hermite interpolation between 0and 1when edge0 < x<
    edge1.This is useful in cases where a threshold function with a
    smooth transition is needed.
    This is equivalent to the following where tis the same type as x :
t = clamp((x– edge0) / (edge1– edge0), 0, 1);
return t* t* (3 – 2 * t)
The results are undefined if edge0 >= edge1or if x, edge0, or
edge1is a NaN.
*/
gentype sign(gentype x); // Returns 1.0if x> 0, -0.0if x = -0.0, +0.0if x = +0.0, or - 1.0  if x < 0. Returns 0.0if xis a NaN



// Geometric Functions





// Relational Functions


float4 cross(float4 p0, float4 p1);
float3 cross(float3 p0, float3 p1);
double4 cross(double4 p0, double4 p1);
double3 cross(double3 p0, double3 p1);
/*
Returns the cross - product of p0.xyzand p1.xyz.The wcomponent of a 4 - component vector result returned will be 0.
The cross - product is specified only for a 3 - or 4 - component vector.
*/
float dot(gentypef p0, gentypef p1);
double dot(gentyped p0, gentyped p1);   // Returns the dot product of p0and p1.
float distance(gentypef p0, gentypef p1);
double distance(gentyped p0, gentyped p1);   // Returns the distance between p0and p1.This is calculated as length(p0– p1).
float length(gentypef p);
double length(gentyped p);
/*
Returns the length of vector p, i.e.,
3p.x2 + p.y2 + …
The length is calculated without overflow or extraordinary
precision loss due to underflow.
*/
gentypef normalize(gentypef p);
gentyped normalize(gentyped p);
/*
Returns a vector in the same direction as pbut with a length of
1.
normalize(p)function returns pif all elements of pare zero.
normalize(p)returns a vector full of NaNs if any element is a
NaN.
normalize(p)for which any element in pis infinite proceeds as
if the elements in pwere replaced as follows :
for (i = 0; i < sizeof(p) / sizeof(p[0]); i++)
p[i] = isinf(p[i])
?
copysign(1.0, p[i])
: 0.0 * p[i];
*/
float fast_distance(gentypef p0, gentypef p1);
/*
    Returns fast_length(p0– p1)

    floatfast_length(gentypefp) Returns the length of vector pcomputed as
    half_sqrt(p.x
    2
    + p.y
    2
    + …)
    gentypeffast_normalize(gentypefp) Returns a vector in the same direction as pbut with a length
    of1.
    fast_normalizeis computed as
    p* half_sqrt(p.x
    2
    + p.y
    2
    + …)
    The result will be within 8192 ulps error from the infinitely
    precise result of
if (all(p == 0.0f))
result = p;
else
result = p / sqrt(p.x
2
+ p.y
2
+ …)
It has the following exceptions :
• If the sum of squares is greater than FLT_MAX, then the value
of the floating - point values in the result vector is undefined.
• If the sum of squares is less than FLT_MIN, then the implementation may return back p.
• If the device is in “denorms are flushed to zero” mode,
individual operand elements with magnitude less than
sqrt(FLT_MIN)may be flushed to zero before proceeding
with the calculation.
*/



int isequal(float x, float y);
int isequal(double x, double y);
intn isequal(floatn x, floatn y);
longn isequal(doublen x, doublen y);  // Returns the component - wise compare of x == y.
int isnotequal(float x, float y);
int isnotequal(double x, double y);
intn isnotequal(floatn x, floatn y);
longn isnotequal(doublen x, doublen y);   // Returns the component - wise compare of x != y.
int isgreater(float x, float y);
int isgreater(double x, double y);
intn isgreater(floatn x, floatn y);
longn isgreater(doublen x, doublen y);    // Returns the component - wise compare of x > y.
int isgreaterequal(float x, float y);
int isgreaterequal(double x, double y);
intn isgreaterequal(floatn x, floatn y);
longn isgreaterequal(doublen x, doublen y);  // Returns the component - wise compare of x >= y.
int isless(float x, float y);
int isless(double x, double y);
intn isless(floatn x, floatn y);
longn isless(doublen x, doublen y);    //Returns the component - wise compare of x < y.
int islessequal(float x, float y);
int islessequal(double x, double y);
intn islessequal(floatn x, floatn y);
longn islessequal(doublen x, doublen y);   // Returns the component - wise compare of x <= y.
int islessgreater(float x, float y);
int islessgreater(double x, double y);
intn islessgreater(floatn x, floatn y);
longn islessgreater(doublen x, doublen y);   // Returns the component - wise compare of(x< y) || (x> y)



int isfinite(float x);
int isfinite(double x);
intn isfinite(floatn x);
longn isfinite(doublen x);  // Tests for the f i nite value of x.
int isinf(float x);
int isinf(double x);
intn isinf(floatn x);
longn isinf(doublen x);  // Tests for the i nf i nite value(positive or negative) of x.
int isnan(float x);
int isnan(double x);
intn isnan(floatn x);
longn isnan(doublen x);  // Tests for a NaN.
int isnormal(float x);
int isnormal(double x);
intn isnormal(floatn x);
longn isnormal(doublen x);   // Tests for a normal value(i.e., xis neither zero, denormal, infinite, nor NaN).
int isordered(float x, float y);
int isordered(double x, double y);
intn isordered(floatn x, floatn y);
longn isordered(doublen x, doublen y); // Tests i f arguments are ordered.isorderedtakes arguments x and yand returns the result isequal(x, x) && isequal(y, y)
int isunordered(float x, float y);
int isunordered(double x, double y);
intn isunordered(floatn x, floatn y);
longn isunordered(doublen x, doublen y);
/*
Tests i f arguments are unordered.isunorderedtakes arguments
xand y, returning non - zero if xor yis NaN, and zero otherwise.
*/
int signbit(float x);
int signbit(double x);
intn signbit(floatn x);
longn signbit(doublen x);
/*
Tests for sign bit.The scalar version of the f unction ret urns a 1if
the sign bit in the floating - point value of xis set, else it returns
0. The vector version of the function returns the following for
each component : a - 1if the sign bit in the floating - point value is
set, else 0.
*/

int any(sgentype x); // Returns 1if the most significant bit in any component of xis set; otherwise returns 0.
int all(sgentype x); // Returns 1if the most significant bit in all components of xis set; otherwise returns 0.
gentype bitselect(gentype a, gentype b, gentype c); 
/*
Each bit of the result is the corresponding bit of aif
the corresponding bit of cis 0. Otherwise it is the
corresponding bit of b.
*/
gentype select(gentype a, gentype b, sgentype c);
gentype select(gentype a, gentype b, ugentype c);
/*
For each component of a vector type
result[i] = if MSB of c[i] is set ?
b[i] : a[i]
For a scalar type
result = c ? b : a
sgentypeand ugentypemust have the same
number of elements and bits as gentype
*/






// Synchronization Functions

void barrier(cl_mem_fence_flags flags);








// Vector Data Load and Store Functions
/*
gentype to indicate the scalar built-in data types char,uchar,short,
ushort,int,uint,long,ulong,float, or double. We use the generic
type name gentypento indicate the n-element vectors of gentypeelements. We use the type name floatn,doublen, and halfnto represent
n-element vectors of float,double, and halfelements, respectively. The
suffix nis also used in the function names (such as vloadn,vstoren),
where n= 2,3,4,8, or 16
*/
gentypen vloadn(size_t offset, const global gentype *p);
gentypen vloadn(size_t offset, const local gentype *p);
gentypen vloadn(size_t offset, const constant gentype *p);
gentypen vloadn(size_t offset, const private gentype *p);

gentypen vstoren(gentypen data, size_t offset, global gentype *p);
gentypen vstoren(gentypen data, size_t offset, local gentype *p);
gentypen vstoren(gentypen data, size_t offset, private gentype *p);

float vload_half(size_t offset, const global half *p);
float vload_half(size_t offset, const local half *p);
float vload_half(size_t offset, const constant half *p);
float vload_half(size_t offset, const private half *p);

floatn vload_halfn(size_t offset, const global half *p);
floatn vload_halfn(size_t offset, const local half *p);
floatn vload_halfn(size_t offset, const constant half *p);
floatn vload_halfn(size_t offset, const private half *p);


void vstore_half(float data, size_t offset, global half *p);
void vstore_half_rte(float data, size_t offset, global half *p);
void vstore_half_rtz(float data, size_t offset, global half *p);
void vstore_half_rtp(float data, size_t offset, global half *p);
void vstore_half_rtn(float data, size_t offset, global half *p);
void vstore_half(float data, size_t offset, local half *p);
void vstore_half_rte(float data, size_t offset, local half *p);
void vstore_half_rtz(float data, size_t offset, local half *p);
void vstore_half_rtp(float data, size_t offset, local half *p);
void vstore_half_rtn(float data, size_t offset, local half *p);
void vstore_half(float data, size_t offset, private half *p);
void vstore_half_rte(float data, size_t offset, private half *p);
void vstore_half_rtz(float data, size_t offset, private half *p);
void vstore_half_rtp(float data, size_t offset, private half *p);
void vstore_half_rtn(float data, size_t offset, private half *p);


void vstore_halfn(floatn data, size_t offset, global half *p);
void vstore_halfn_rte(floatn data, size_t offset, global half *p);
void vstore_halfn_rtz(floatn data, size_t offset, global half *p);
void vstore_halfn_rtp(floatn data, size_t offset, global half *p);
void vstore_halfn_rtn(floatn data, size_t offset, global half *p);
void vstore_halfn(floatn data, size_t offset, local half *p);
void vstore_halfn_rte(floatn data, size_t offset, local half *p);
void vstore_halfn_rtz(floatn data, size_t offset, local half *p);
void vstore_halfn_rtp(floatn data, size_t offset, local half *p);
void vstore_halfn_rtn(floatn data, size_t offset, local half *p);
void vstore_halfn(floatn data, size_t offset, private half *p);
void vstore_halfn_rte(floatn data, size_t offset, private half *p);
void vstore_halfn_rtz(floatn data, size_t offset, private half *p);
void vstore_halfn_rtp(floatn data, size_t offset, private half *p);
void vstore_halfn_rtn(floatn data, size_t offset, private half *p);

floatn vloada_halfn(size_t offset, const global half *p);
floatn vloada_halfn(size_t offset, const local half *p);
floatn vloada_halfn(size_t offset, const constant half *p);
floatn vloada_halfn(size_t offset, const private half *p);

void vstorea_halfn(floatn data, size_t offset, global half *p);
void vstorea_halfn_rte(floatn data, size_t offset, global half *p);
void vstorea_halfn_rtz(floatn data, size_t offset, global half *p);
void vstorea_halfn_rtp(floatn data, size_t offset, global half *p);
void vstorea_halfn_rtn(floatn data, size_t offset, global half *p);
void vstorea_halfn(floatn data, size_t offset, local half *p);
void vstorea_halfn_rte(floatn data, size_t offset, local half *p);
void vstorea_halfn_rtz(floatn data, size_t offset, local half *p);
void vstorea_halfn_rtp(floatn data, size_t offset, local half *p);
void vstorea_halfn_rtn(floatn data, size_t offset, local half *p);
void vstorea_halfn(floatn data, size_t offset, private half *p);
void vstorea_halfn_rte(floatn data, size_t offset, private half *p);
void vstorea_halfn_rtz(floatn data, size_t offset, private half *p);
void vstorea_halfn_rtp(floatn data, size_t offset, private half *p);
void vstorea_halfn_rtn(floatn data, size_t offset, private half *p);


void vstore_half(double data, size_t offset, global half *p);
void vstore_half_rte(double data, size_t offset, global half *p);
void vstore_half_rtz(double data, size_t offset, global half *p);
void vstore_half_rtp(double data, size_t offset, global half *p);
void vstore_half_rtn(double data, size_t offset, global half *p);
void vstore_half(double data, size_t offset, local half *p);
void vstore_half_rte(double data, size_t offset, local half *p);
void vstore_half_rtz(double data, size_t offset, local half *p);
void vstore_half_rtp(double data, size_t offset, local half *p);
void vstore_half_rtn(double data, size_t offset, local half *p);
void vstore_half(double data, size_t offset, private half *p);
void vstore_half_rte(double data, size_t offset, private half *p);
void vstore_half_rtz(double data, size_t offset, private half *p);
void vstore_half_rtp(double data, size_t offset, private half *p);
void vstore_half_rtn(double data, size_t offset, private half *p);


void vstore_halfn(doublen data, size_t offset, global half *p);
void vstore_halfn_rte(doublen data, size_t offset, global half *p);
void vstore_halfn_rtz(doublen data, size_t offset, global half *p);
void vstore_halfn_rtp(doublen data, size_t offset, global half *p);
void vstore_halfn_rtn(doublen data, size_t offset, global half *p);
void vstore_halfn(doublen data, size_t offset, local half *p);
void vstore_halfn_rte(doublen data, size_t offset, local half *p);
void vstore_halfn_rtz(doublen data, size_t offset, local half *p);
void vstore_halfn_rtp(doublen data, size_t offset, local half *p);
void vstore_halfn_rtn(doublen data, size_t offset, local half *p);
void vstore_halfn(doublen data, size_t offset, private half *p);
void vstore_halfn_rte(doublen data, size_t offset, private half *p);
void vstore_halfn_rtz(doublen data, size_t offset, private half *p);
void vstore_halfn_rtp(doublen data, size_t offset, private half *p);
void vstore_halfn_rtn(doublen data, size_t offset, private half *p);


void vstorea_halfn(doublen data, size_t offset, global half *p);
void vstorea_halfn_rte(doublen data, size_t offset, global half *p);
void vstorea_halfn_rtz(doublen data, size_t offset, global half *p);
void vstorea_halfn_rtp(doublen data, size_t offset, global half *p);
void vstorea_halfn_rtn(doublen data, size_t offset, global half *p);
void vstorea_halfn(doublen data, size_t offset, local half *p);
void vstorea_halfn_rte(doublen data, size_t offset, local half *p);
void vstorea_halfn_rtz(doublen data, size_t offset, local half *p);
void vstorea_halfn_rtp(doublen data, size_t offset, local half *p);
void vstorea_halfn_rtn(doublen data, size_t offset, local half *p);
void vstorea_halfn(doublen data, size_t offset, private half *p);
void vstorea_halfn_rte(doublen data, size_t offset, private half *p);
void vstorea_halfn_rtz(doublen data, size_t offset, private half *p);
void vstorea_halfn_rtp(doublen data, size_t offset, private half *p);
void vstorea_halfn_rtn(doublen data, size_t offset, private half *p);





// Async Copy and Prefetch Functions

event_t async_work_group_copy(local gentype *dst,const global gentype *src,size_t num_gentypes,event_t event);
event_t async_work_group_copy(global gentype *dst,const local gentype *src,size_t num_gentypes,event_t event);
event_t async_work_group_strided_copy(local gentype *dst, const global gentype *src, size_t num_gentypes, size_t src_stride, event_t event);
event_t async_work_group_strided_copy(global gentype *dst,const local gentype *src,size_t num_gentypes,size_t dst_stride,event_t event);
void wait_group_events(int num_events,event_t *event_list);
void prefetch(const global gentype *p,size_t num_gentypes);




// Atomic Functions

int atomic_add(volatile global int *p, int val);
unsigned int atomic_add(volatile global unsigned int *p,unsigned int val);
int atomic_add(volatile local int *p, int val);
unsigned int atomic_add(volatile local unsigned int *p,unsigned int val);

int atomic_sub(volatile global int *p, int val);
unsigned int atomic_sub(volatile global unsigned int *p, unsigned int val);
int atomic_sub(volatile local int *p, int val);
unsigned int atomic_sub(volatile local unsigned int *p, unsigned int val);

int atomic_xchg(volatile global int *p, int val);
unsigned int atomic_xchg(volatile global unsigned int *p,unsigned int val);
float atomic_xchg(volatile global int *p, float val);
int atomic_xchg(volatile local int *p, int val);
unsigned int atomic_xchg(volatile local unsigned int *p,  unsigned int val);
float atomic_xchg(volatile local int *p, float val);


int atomic_inc(volatile global int *p);
unsigned int atomic_inc(volatile global unsigned int *p);
int atomic_inc(volatile local int *p);
unsigned int atomic_inc(volatile local unsigned int *p);


int atomic_dec(volatile global int *p);
unsigned int atomic_dec(volatile global unsigned int *p);
int atomic_dec(volatile local int *p);
unsigned int atomic_dec(volatile local unsigned int *p);


int atomic_cmpxchg(volatile global int *p, int cmp, int val);
unsigned int atomic_cmpxchg(volatile global unsigned int *p, unsigned int cmp, unsigned int val);
int atomic_cmpxchg(volatile local int *p, int cmp, int val);
unsigned int atomic_cmpxchg(volatile local unsigned int *p, unsigned int cmp, unsigned int val);


int atomic_min(volatile global int *p, int val);
unsigned int atomic_min(volatile global unsigned int *p, unsigned int val);
int atomic_min(volatile local int *p, int val);
unsigned int atomic_min(volatile local unsigned int *p, unsigned int val);


int atomic_max(volatile global int *p, int val);
unsigned int atomic_max(volatile global unsigned int *p, unsigned int val) int atomic_max(volatile local int *p, int val);
unsigned int atomic_max(volatile local unsigned int *p, unsigned int val);


int atomic_min(volatile global int *p, int val);
unsigned int atomic_min(volatile global unsigned int *p, unsigned int val);
int atomic_min(volatile local int *p, int val);
unsigned int atomic_min(volatile local unsigned int *p, unsigned int val);

int atomic_and(volatile global int *p, int val);
unsigned int atomic_and(volatile global unsigned int *p, unsigned int val);
int atomic_and(volatile local int *p, int val);
unsigned int atomic_and(volatile local unsigned int *p, unsigned int val);


int atomic_or(volatile global int *p, int val);
unsigned int atomic_or(volatile global unsigned int *p,unsigned int val);
int atomic_or(volatile local int *p, int val);
unsigned int atomic_or(volatile local unsigned int *p, unsigned int val);


int atomic_xor(volatile global int *p, int val);
unsigned int atomic_xor(volatile global unsigned int *p, unsigned int val);
int atomic_xor(volatile local int *p, int val);
unsigned int atomic_xor(volatile local unsigned int *p, unsigned int val);


// Miscellaneous Vector Functions

int vec_step(gentype a);
int vec_step(gentypen a);
int vec_step(char3 a);
int vec_step(uchar3 a);
int vec_step(short3 a);
int vec_step(ushort3 a);
int vec_step(half3 a);
int vec_step(int3 a);
int vec_step(uint3 a);
int vec_step(long3 a);
int vec_step(ulong3 a);
int vec_step(float3 a);
int vec_step(double3 a);
int vec_step(type a);

gentypen shuffle(gentypem x, ugentypen mask);
gentypen shuffle2(gentypem x, gentypem y, ugentypen mask);




// Image Read and Write Functions

float4 read_imagef(image2d_t image,  sampler_t sampler, float2 coord);
float4 read_imagef(image2d_t image,  sampler_t sampler, int2 coord);
int4 read_imagei(image2d_t image,  sampler_t sampler, float2 coord);
int4 read_imagei(image2d_t image,  sampler_t sampler, int2 coord);
uint4 read_imageui(image2d_t image, sampler_t sampler,  float2 coord);
uint4 read_imageui(image2d_t image,  sampler_t sampler,  int2 coord);
float4 read_imagef(image3d_t image,  sampler_t sampler,  float4 coord);
float4 read_imagef(image3d_t image,  sampler_t sampler, int4 coord);
int4 read_imagei(image3d_t image,  sampler_t sampler,  float4 coord);
int4 read_imagei(image3d_t image, sampler_t sampler, int4 coord);
uint4 read_imageui(image3d_t image,  sampler_t sampler,  float4 coord);
uint4 read_imageui(image3d_t image,  sampler_t sampler, int4 coord);
void write_imagef(image2d_t image,  int2 coord,  float4 color);
void write_imagei(image2d_t image, int2 coord,  int4 color);
void write_imageui(image2d_t image, int2 coord, uint4 color);
void write_imagef(image3d_t image, int4 coord, float4 color);
void write_imagei(image3d_t image, int4 coord,  int4 color);
void write_imageui(image3d_t image, int4 coord, uint4 color);

int get_image_width(image2d_t image);
int get_image_width(image3d_t image); // Returns the image width in pixels.
int get_image_height(image2d_t image);
int get_image_height(image3d_t image);   // Returns the image height in pixels.
int get_image_depth(image3d_t image);    // Returns the image depth in pixels.
int2 get_image_dim(image2d_t image);     // Returns the 2D image dimensions in an int2.The width is returned in the x component and the height in the y component.
int4 get_image_dim(image3d_t image);  // Returns the 3D image dimensions in an int4.The width is returned in the x component, the height in the ycomponent, and the depth in the z component.

int get_image_channel_data_type(image2d_t image);
int get_image_channel_data_type(image3d_t image);   /* Returns the channel data type of the image.Valid values are
                                                        CLK_SNORM_INT8
                                                        CLK_SNORM_INT16
                                                        CLK_UNORM_INT8
                                                        CLK_UNORM_INT16
                                                        CLK_UNORM_SHORT_565
                                                        CLK_UNORM_SHORT_555
                                                        CLK_UNORM_SHORT_101010
                                                        CLK_SIGNED_INT8
                                                        CLK_SIGNED_INT16
                                                        CLK_SIGNED_INT32
                                                        CLK_UNSIGNED_INT8
                                                        CLK_UNSIGNED_INT16
                                                        CLK_UNSIGNED_INT32
                                                        CLK_HALF_FLOAT
                                                        CLK_FLOAT
                                                        */
int get_image_channel_data_order(image2d_t image);
int get_image_channel_data_order(image3d_t image);    /*  Returns the image channel order.Valid values are
                                                        CLK_A
                                                        CLK_R
                                                        CLK_Rx
                                                        CLK_RG
                                                        CLK_RGx
                                                        CLK_RGB
                                                        CLK_RGBx
                                                        CLK_RGBA
                                                        CLK_ARGB
                                                        CLK_BGRA
                                                        CLK_INTENSITY
                                                        CLK_LUMINANCE
                                                        */