about | summary | refs | log | tree | commit | diff
path: root/include/clc.h
diff options
context:
space:
mode:
Diffstat (limited to 'include/clc.h')
-rw-r--r-- include/clc.h 1939
1 files changed, 1939 insertions, 0 deletions
diff --git a/include/clc.h b/include/clc.h
new file mode 100644
index 0000000..c6e6b5d
--- /dev/null
+++ b/include/clc.h
@@ -0,0 +1,1939 @@
+/*
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * Copyright (c) 2011-2014, Peter Collingbourne <peter@pcc.me.uk>
+ * Copyright (c) 2013 Victor Oliveira <victormatheus@gmail.com>
+ * Copyright (c) 2013 Jesse Towner <jessetowner@lavabit.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _CLC_H_
+#define _CLC_H_
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+/* Attribute shorthands. _CLC_PROTECTED, _CLC_DECL and _CLC_DEF all expand to protected visibility (they only differ in where they are written: helper prototypes, overload prototypes, definitions); _CLC_OVERLOAD marks a function overloadable; _CLC_INLINE forces inlining. */
+#define _CLC_PROTECTED __attribute__((visibility("protected")))
+#define _CLC_OVERLOAD __attribute__((overloadable))
+#define _CLC_DECL __attribute__((visibility("protected")))
+#define _CLC_DEF __attribute__((visibility("protected")))
+#define _CLC_INLINE __attribute__((always_inline)) inline
+/* Prototype generators: each declares `name` for the 2-, 3-, 4-, 8- and 16-element vectors of `type`, returning the same-width `utype` vector. UNARY/BINARY/TERNARY take 1/2/3 operands of `type`; BINARY_VEC_DECL_ALT takes its second operand from `type2`. */
+#define UNARY_VEC_DECL(type,utype,name) \
+_CLC_OVERLOAD _CLC_DECL utype##2 name(type##2 x); \
+_CLC_OVERLOAD _CLC_DECL utype##3 name(type##3 x); \
+_CLC_OVERLOAD _CLC_DECL utype##4 name(type##4 x); \
+_CLC_OVERLOAD _CLC_DECL utype##8 name(type##8 x); \
+_CLC_OVERLOAD _CLC_DECL utype##16 name(type##16 x);\
+
+#define BINARY_VEC_DECL(type,utype,name) \
+_CLC_OVERLOAD _CLC_DECL utype##2 name(type##2 x, type##2 y); \
+_CLC_OVERLOAD _CLC_DECL utype##3 name(type##3 x, type##3 y); \
+_CLC_OVERLOAD _CLC_DECL utype##4 name(type##4 x, type##4 y); \
+_CLC_OVERLOAD _CLC_DECL utype##8 name(type##8 x, type##8 y); \
+_CLC_OVERLOAD _CLC_DECL utype##16 name(type##16 x, type##16 y);\
+
+#define BINARY_VEC_DECL_ALT(type,utype,type2,name) \
+_CLC_OVERLOAD _CLC_DECL utype##2 name(type##2 x, type2##2 y); \
+_CLC_OVERLOAD _CLC_DECL utype##3 name(type##3 x, type2##3 y); \
+_CLC_OVERLOAD _CLC_DECL utype##4 name(type##4 x, type2##4 y); \
+_CLC_OVERLOAD _CLC_DECL utype##8 name(type##8 x, type2##8 y); \
+_CLC_OVERLOAD _CLC_DECL utype##16 name(type##16 x, type2##16 y);\
+
+#define TERNARY_VEC_DECL(type,utype,name) \
+_CLC_OVERLOAD _CLC_DECL utype##2 name(type##2 x, type##2 y, type##2 z); \
+_CLC_OVERLOAD _CLC_DECL utype##3 name(type##3 x, type##3 y, type##3 z); \
+_CLC_OVERLOAD _CLC_DECL utype##4 name(type##4 x, type##4 y, type##4 z); \
+_CLC_OVERLOAD _CLC_DECL utype##8 name(type##8 x, type##8 y, type##8 z); \
+_CLC_OVERLOAD _CLC_DECL utype##16 name(type##16 x, type##16 y,type##16 z);\
+
+#define UNARY_INLINE(type,utype,name,op) /* declares scalar `op`, then defines overload `name` as an inline forward to it */ \
+_CLC_PROTECTED utype op(type x); \
+_CLC_OVERLOAD _CLC_INLINE utype name(type x) { return op(x); }
+/* Two-operand form of UNARY_INLINE: both operands have type `type`. */
+#define BINARY_INLINE(type,utype,name,op) \
+_CLC_PROTECTED utype op(type x, type y); \
+_CLC_OVERLOAD _CLC_INLINE utype name(type x, type y) { return op(x, y); }
+/* As BINARY_INLINE, but the second operand has its own type `type2`. */
+#define BINARY_INLINE_ALT(type,utype,type2,name,op) \
+_CLC_PROTECTED utype op(type x, type2 y); \
+_CLC_OVERLOAD _CLC_INLINE utype name(type x, type2 y) { return op(x, y); }
+/* Defines `name(type, type2)` by casting y to `type` and calling `op`; unlike the other forms it does not declare `op` itself. */
+#define BINARY_INLINE_ALT2(type,utype,type2,name,op) \
+_CLC_OVERLOAD _CLC_INLINE utype name(type x, type2 y) { return op(x, (type)y); }
+/* Three-operand form of UNARY_INLINE. */
+#define TERNARY_INLINE(type,utype,name,op) \
+_CLC_PROTECTED utype op(type x, type y, type z); \
+ _CLC_OVERLOAD _CLC_INLINE utype name(type x, type y, type z) { return op(x, y, z); }
+/* Definition generators matching the *_VEC_DECL prototypes: every vector overload of `name` applies scalar `op` lane by lane and repacks the results as a `utype` vector. */
+#define UNARY_VEC_DEF(type,utype,name,op)\
+_CLC_OVERLOAD _CLC_DEF utype##2 name(type##2 x) \
+{ return (utype##2) (op(x.s0), op(x.s1)); }\
+_CLC_OVERLOAD _CLC_DEF utype##3 name(type##3 x) \
+{ return (utype##3) (op(x.s0), op(x.s1), op(x.s2)); }\
+_CLC_OVERLOAD _CLC_DEF utype##4 name(type##4 x) \
+{ return (utype##4) (op(x.s0), op(x.s1), op(x.s2), op(x.s3)); }\
+_CLC_OVERLOAD _CLC_DEF utype##8 name(type##8 x) \
+{ return (utype##8) (op(x.s0), op(x.s1), op(x.s2), op(x.s3),\
+ op(x.s4), op(x.s5), op(x.s6), op(x.s7)); }\
+_CLC_OVERLOAD _CLC_DEF utype##16 name(type##16 x) \
+{ return (utype##16) (op(x.s0), op(x.s1), op(x.s2), op(x.s3),\
+ op(x.s4), op(x.s5), op(x.s6), op(x.s7),\
+ op(x.s8), op(x.s9), op(x.sa), op(x.sb),\
+ op(x.sc), op(x.sd), op(x.se), op(x.sf)); }
+/* Two-operand lane-wise definition: lane i of the result is op(x.si, y.si). */
+#define BINARY_VEC_DEF(type,utype,name,op)\
+_CLC_OVERLOAD _CLC_DEF utype##2 name(type##2 x, type##2 y) \
+{ return (utype##2) (op(x.s0,y.s0), op(x.s1,y.s1)); }\
+_CLC_OVERLOAD _CLC_DEF utype##3 name(type##3 x, type##3 y) \
+{ return (utype##3) (op(x.s0,y.s0), op(x.s1,y.s1), op(x.s2,y.s2)); }\
+_CLC_OVERLOAD _CLC_DEF utype##4 name(type##4 x, type##4 y) \
+{ return (utype##4) (op(x.s0,y.s0), op(x.s1,y.s1), op(x.s2,y.s2), op(x.s3,y.s3)); }\
+_CLC_OVERLOAD _CLC_DEF utype##8 name(type##8 x, type##8 y) \
+{ return (utype##8) (op(x.s0,y.s0), op(x.s1,y.s1), op(x.s2,y.s2), op(x.s3,y.s3),\
+ op(x.s4,y.s4), op(x.s5,y.s5), op(x.s6,y.s6), op(x.s7,y.s7)); }\
+_CLC_OVERLOAD _CLC_DEF utype##16 name(type##16 x, type##16 y) \
+{ return (utype##16) (op(x.s0,y.s0), op(x.s1,y.s1), op(x.s2,y.s2), op(x.s3,y.s3),\
+ op(x.s4,y.s4), op(x.s5,y.s5), op(x.s6,y.s6), op(x.s7,y.s7),\
+ op(x.s8,y.s8), op(x.s9,y.s9), op(x.sa,y.sa), op(x.sb,y.sb),\
+ op(x.sc,y.sc), op(x.sd,y.sd), op(x.se,y.se), op(x.sf,y.sf)); }
+/* Two-operand lane-wise definition whose second operand is a `type2` vector. */
+#define BINARY_VEC_DEF_ALT(type,utype,type2,name,op)\
+_CLC_OVERLOAD _CLC_DEF utype##2 name(type##2 x, type2##2 y) \
+{ return (utype##2) (op(x.s0,y.s0), op(x.s1,y.s1)); }\
+_CLC_OVERLOAD _CLC_DEF utype##3 name(type##3 x, type2##3 y) \
+{ return (utype##3) (op(x.s0,y.s0), op(x.s1,y.s1), op(x.s2,y.s2)); }\
+_CLC_OVERLOAD _CLC_DEF utype##4 name(type##4 x, type2##4 y) \
+{ return (utype##4) (op(x.s0,y.s0), op(x.s1,y.s1), op(x.s2,y.s2), op(x.s3,y.s3)); }\
+_CLC_OVERLOAD _CLC_DEF utype##8 name(type##8 x, type2##8 y) \
+{ return (utype##8) (op(x.s0,y.s0), op(x.s1,y.s1), op(x.s2,y.s2), op(x.s3,y.s3),\
+ op(x.s4,y.s4), op(x.s5,y.s5), op(x.s6,y.s6), op(x.s7,y.s7)); }\
+_CLC_OVERLOAD _CLC_DEF utype##16 name(type##16 x, type2##16 y) \
+{ return (utype##16) (op(x.s0,y.s0), op(x.s1,y.s1), op(x.s2,y.s2), op(x.s3,y.s3),\
+ op(x.s4,y.s4), op(x.s5,y.s5), op(x.s6,y.s6), op(x.s7,y.s7),\
+ op(x.s8,y.s8), op(x.s9,y.s9), op(x.sa,y.sa), op(x.sb,y.sb),\
+ op(x.sc,y.sc), op(x.sd,y.sd), op(x.se,y.se), op(x.sf,y.sf)); }
+/* Three-operand lane-wise definition: lane i of the result is op(x.si, y.si, z.si). */
+#define TERNARY_VEC_DEF(type,utype,name,op)\
+_CLC_OVERLOAD _CLC_DEF utype##2 name(type##2 x, type##2 y, type##2 z) \
+{ return (utype##2) (op(x.s0,y.s0,z.s0), op(x.s1,y.s1,z.s1)); }\
+_CLC_OVERLOAD _CLC_DEF utype##3 name(type##3 x, type##3 y, type##3 z) \
+{ return (utype##3) (op(x.s0,y.s0,z.s0), op(x.s1,y.s1,z.s1), op(x.s2,y.s2,z.s2)); }\
+_CLC_OVERLOAD _CLC_DEF utype##4 name(type##4 x, type##4 y, type##4 z) \
+{ return (utype##4) (op(x.s0,y.s0,z.s0), op(x.s1,y.s1,z.s1), \
+ op(x.s2,y.s2,z.s2), op(x.s3,y.s3,z.s3)); }\
+_CLC_OVERLOAD _CLC_DEF utype##8 name(type##8 x, type##8 y, type##8 z) \
+{ return (utype##8) (op(x.s0,y.s0,z.s0), op(x.s1,y.s1,z.s1), \
+ op(x.s2,y.s2,z.s2), op(x.s3,y.s3,z.s3),\
+ op(x.s4,y.s4,z.s4), op(x.s5,y.s5,z.s5), \
+ op(x.s6,y.s6,z.s6), op(x.s7,y.s7,z.s7)); }\
+_CLC_OVERLOAD _CLC_DEF utype##16 name(type##16 x, type##16 y, type##16 z) \
+{ return (utype##16) (op(x.s0,y.s0,z.s0), op(x.s1,y.s1,z.s1), \
+ op(x.s2,y.s2,z.s2), op(x.s3,y.s3,z.s3),\
+ op(x.s4,y.s4,z.s4), op(x.s5,y.s5,z.s5), \
+ op(x.s6,y.s6,z.s6), op(x.s7,y.s7,z.s7),\
+ op(x.s8,y.s8,z.s8), op(x.s9,y.s9,z.s9), \
+ op(x.sa,y.sa,z.sa), op(x.sb,y.sb,z.sb),\
+ op(x.sc,y.sc,z.sc), op(x.sd,y.sd,z.sd), \
+ op(x.se,y.se,z.se), op(x.sf,y.sf,z.sf)); }
+
+/* Pastes an element type and a width into a vector type name, e.g. _VEC_TYPE(int,4) -> int4. */
+#define _VEC_TYPE(type,sz) type##sz
+/* Applies EXPAND_SIZES to every scalar element type. EXPAND_SIZES(type) must be defined before this is invoked; no definition appears in this part of the header. */
+#define _EXPAND_TYPES() \
+ EXPAND_SIZES(char) \
+ EXPAND_SIZES(uchar) \
+ EXPAND_SIZES(short) \
+ EXPAND_SIZES(ushort) \
+ EXPAND_SIZES(int) \
+ EXPAND_SIZES(uint) \
+ EXPAND_SIZES(long) \
+ EXPAND_SIZES(ulong) \
+ EXPAND_SIZES(float) \
+ EXPAND_SIZES(double)
+/* Same as _EXPAND_TYPES, restricted to the integer element types (no float/double). */
+#define _EXPAND_INTEGER_TYPES() \
+ EXPAND_SIZES(char) \
+ EXPAND_SIZES(uchar) \
+ EXPAND_SIZES(short) \
+ EXPAND_SIZES(ushort) \
+ EXPAND_SIZES(int) \
+ EXPAND_SIZES(uint) \
+ EXPAND_SIZES(long) \
+ EXPAND_SIZES(ulong)
+/* Flag word accepted by barrier() and the *mem_fence() functions (see CLK_LOCAL_MEM_FENCE / CLK_GLOBAL_MEM_FENCE). */
+typedef unsigned int cl_mem_fence_flags;
+
+/*-----------------------------------------------------------------------------
+* Standard types from Clang's stddef and stdint, Copyright (C) 2008 Eli Friedman
+*----------------------------------------------------------------------------*/
+typedef __INT64_TYPE__ int64_t;
+typedef __UINT64_TYPE__ uint64_t;
+typedef __INT32_TYPE__ int32_t;
+typedef __UINT32_TYPE__ uint32_t;
+typedef __INT16_TYPE__ int16_t;
+typedef __UINT16_TYPE__ uint16_t;
+typedef __INT8_TYPE__ int8_t;
+typedef __UINT8_TYPE__ uint8_t;
+/* Token-pasting helpers used only to spell intN_t / uintN_t from a width macro; they are undefined again right after use. */
+#define __stdint_join3(a,b,c) a ## b ## c
+#define __intn_t(n) __stdint_join3( int, n, _t)
+#define __uintn_t(n) __stdint_join3(uint, n, _t)
+/* ptrdiff_t and size_t are taken from the compiler's own pointer-difference and sizeof result types; intptr_t/uintptr_t are the fixed-width typedefs selected by __INTPTR_WIDTH__. */
+typedef __typeof__(((int*)0)-((int*)0)) ptrdiff_t;
+typedef __typeof__(sizeof(int)) size_t;
+typedef __intn_t(__INTPTR_WIDTH__) intptr_t;
+typedef __uintn_t(__INTPTR_WIDTH__) uintptr_t;
+
+#undef __stdint_join3
+#undef __intn_t
+#undef __uintn_t
+
+/*-----------------------------------------------------------------------------
+* OpenCL types
+*----------------------------------------------------------------------------*/
+typedef uint8_t uchar;
+typedef uint16_t ushort;
+typedef uint32_t uint;
+typedef uint64_t ulong;
+/* Fallback sampler/image type definitions, compiled only when CLANG_OLDER_THAN_3_3 is defined. */
+#if defined(CLANG_OLDER_THAN_3_3)
+typedef unsigned int sampler_t;
+typedef struct image2d *image2d_t;
+typedef struct image3d *image3d_t;
+#endif
+
+/*-----------------------------------------------------------------------------
+* Vectors: COAL_VECTOR(type, len) typedefs type##len as a len-element ext_vector_type
+*----------------------------------------------------------------------------*/
+#define COAL_VECTOR(type, len) \
+ typedef type type##len __attribute__((ext_vector_type(len)))
+/* Creates the 2-, 3-, 4-, 8- and 16-element vector typedefs for one element type. */
+#define COAL_VECTOR_SET(type) \
+ COAL_VECTOR(type, 2); \
+ COAL_VECTOR(type, 3); \
+ COAL_VECTOR(type, 4); \
+ COAL_VECTOR(type, 8); \
+ COAL_VECTOR(type, 16);
+/* Instantiate the vector typedefs for every scalar element type; the helper macros are dropped afterwards. */
+COAL_VECTOR_SET(char)
+COAL_VECTOR_SET(uchar)
+COAL_VECTOR_SET(short)
+COAL_VECTOR_SET(ushort)
+COAL_VECTOR_SET(int)
+COAL_VECTOR_SET(uint)
+COAL_VECTOR_SET(long)
+COAL_VECTOR_SET(ulong)
+COAL_VECTOR_SET(float)
+COAL_VECTOR_SET(double)
+
+#undef COAL_VECTOR_SET
+#undef COAL_VECTOR
+/* Language version and endianness macros, plus kernel_exec: a __kernel carrying a work-group size hint of (X,1,1) and a vec_type_hint of typen. */
+#define CL_VERSION_1_0 100
+#define CL_VERSION_1_1 110
+#define __OPENCL_VERSION__ 110
+#define __ENDIAN_LITTLE__ 1
+#define __kernel_exec(X, typen) __kernel __attribute__((work_group_size_hint(X, 1, 1))) \
+ __attribute__((vec_type_hint(typen)))
+#define kernel_exec __kernel_exec
+/* Access qualifiers: __write_only expands to nothing and __read_only to const; the unprefixed spellings alias them. */
+#define __write_only
+#define __read_only const
+
+#define write_only __write_only
+#define read_only __read_only
+/* CLK_* values visible to kernels: sampler settings (normalized coordinates, addressing mode, filter), memory-fence flags, then image channel-order (0x10Bx) and channel-data-type (0x10Dx) codes. */
+#define CLK_NORMALIZED_COORDS_FALSE 0x00000000
+#define CLK_NORMALIZED_COORDS_TRUE 0x00000001
+#define CLK_ADDRESS_NONE 0x00000000
+#define CLK_ADDRESS_MIRRORED_REPEAT 0x00000010
+#define CLK_ADDRESS_REPEAT 0x00000020
+#define CLK_ADDRESS_CLAMP_TO_EDGE 0x00000030
+#define CLK_ADDRESS_CLAMP 0x00000040
+#define CLK_FILTER_NEAREST 0x00000000
+#define CLK_FILTER_LINEAR 0x00000100
+#define CLK_LOCAL_MEM_FENCE 0x00000001
+#define CLK_GLOBAL_MEM_FENCE 0x00000002
+#define CLK_R 0x10B0
+#define CLK_A 0x10B1
+#define CLK_RG 0x10B2
+#define CLK_RA 0x10B3
+#define CLK_RGB 0x10B4
+#define CLK_RGBA 0x10B5
+#define CLK_BGRA 0x10B6
+#define CLK_ARGB 0x10B7
+#define CLK_INTENSITY 0x10B8
+#define CLK_LUMINANCE 0x10B9
+#define CLK_Rx 0x10BA
+#define CLK_RGx 0x10BB
+#define CLK_RGBx 0x10BC
+#define CLK_SNORM_INT8 0x10D0
+#define CLK_SNORM_INT16 0x10D1
+#define CLK_UNORM_INT8 0x10D2
+#define CLK_UNORM_INT16 0x10D3
+#define CLK_UNORM_SHORT_565 0x10D4
+#define CLK_UNORM_SHORT_555 0x10D5
+#define CLK_UNORM_INT_101010 0x10D6
+#define CLK_SIGNED_INT8 0x10D7
+#define CLK_SIGNED_INT16 0x10D8
+#define CLK_SIGNED_INT32 0x10D9
+#define CLK_UNSIGNED_INT8 0x10DA
+#define CLK_UNSIGNED_INT16 0x10DB
+#define CLK_UNSIGNED_INT32 0x10DC
+#define CLK_HALF_FLOAT 0x10DD
+#define CLK_FLOAT 0x10DE
+/* Synchronization built-ins: prototypes only (no definitions in this part of the header); each takes a cl_mem_fence_flags word. */
+_CLC_PROTECTED void barrier (cl_mem_fence_flags flags);
+_CLC_PROTECTED void mem_fence (cl_mem_fence_flags flags);
+_CLC_PROTECTED void read_mem_fence (cl_mem_fence_flags flags);
+_CLC_PROTECTED void write_mem_fence (cl_mem_fence_flags flags);
+
+/******************************************************************************
+* as_<type>(x) macros: reinterpret the bits of x as <type> via __builtin_astype
+******************************************************************************/
+#define as_char(x) __builtin_astype(x, char)
+#define as_uchar(x) __builtin_astype(x, uchar)
+#define as_short(x) __builtin_astype(x, short)
+#define as_ushort(x) __builtin_astype(x, ushort)
+#define as_int(x) __builtin_astype(x, int)
+#define as_uint(x) __builtin_astype(x, uint)
+#define as_long(x) __builtin_astype(x, long)
+#define as_ulong(x) __builtin_astype(x, ulong)
+#define as_float(x) __builtin_astype(x, float)
+#define as_double(x) __builtin_astype(x, double)
+
+#define as_char2(x) __builtin_astype(x, char2)
+#define as_uchar2(x) __builtin_astype(x, uchar2)
+#define as_short2(x) __builtin_astype(x, short2)
+#define as_ushort2(x) __builtin_astype(x, ushort2)
+#define as_int2(x) __builtin_astype(x, int2)
+#define as_uint2(x) __builtin_astype(x, uint2)
+#define as_long2(x) __builtin_astype(x, long2)
+#define as_ulong2(x) __builtin_astype(x, ulong2)
+#define as_float2(x) __builtin_astype(x, float2)
+#define as_double2(x) __builtin_astype(x, double2)
+
+#define as_char3(x) __builtin_astype(x, char3)
+#define as_uchar3(x) __builtin_astype(x, uchar3)
+#define as_short3(x) __builtin_astype(x, short3)
+#define as_ushort3(x) __builtin_astype(x, ushort3)
+#define as_int3(x) __builtin_astype(x, int3)
+#define as_uint3(x) __builtin_astype(x, uint3)
+#define as_long3(x) __builtin_astype(x, long3)
+#define as_ulong3(x) __builtin_astype(x, ulong3)
+#define as_float3(x) __builtin_astype(x, float3)
+#define as_double3(x) __builtin_astype(x, double3)
+
+#define as_char4(x) __builtin_astype(x, char4)
+#define as_uchar4(x) __builtin_astype(x, uchar4)
+#define as_short4(x) __builtin_astype(x, short4)
+#define as_ushort4(x) __builtin_astype(x, ushort4)
+#define as_int4(x) __builtin_astype(x, int4)
+#define as_uint4(x) __builtin_astype(x, uint4)
+#define as_long4(x) __builtin_astype(x, long4)
+#define as_ulong4(x) __builtin_astype(x, ulong4)
+#define as_float4(x) __builtin_astype(x, float4)
+#define as_double4(x) __builtin_astype(x, double4)
+
+#define as_char8(x) __builtin_astype(x, char8)
+#define as_uchar8(x) __builtin_astype(x, uchar8)
+#define as_short8(x) __builtin_astype(x, short8)
+#define as_ushort8(x) __builtin_astype(x, ushort8)
+#define as_int8(x) __builtin_astype(x, int8)
+#define as_uint8(x) __builtin_astype(x, uint8)
+#define as_long8(x) __builtin_astype(x, long8)
+#define as_ulong8(x) __builtin_astype(x, ulong8)
+#define as_float8(x) __builtin_astype(x, float8)
+#define as_double8(x) __builtin_astype(x, double8)
+
+#define as_char16(x) __builtin_astype(x, char16)
+#define as_uchar16(x) __builtin_astype(x, uchar16)
+#define as_short16(x) __builtin_astype(x, short16)
+#define as_ushort16(x) __builtin_astype(x, ushort16)
+#define as_int16(x) __builtin_astype(x, int16)
+#define as_uint16(x) __builtin_astype(x, uint16)
+#define as_long16(x) __builtin_astype(x, long16)
+#define as_ulong16(x) __builtin_astype(x, ulong16)
+#define as_float16(x) __builtin_astype(x, float16)
+#define as_double16(x) __builtin_astype(x, double16)
+/* Declares one overload convert_<TO_TYPE><SUFFIX>(FROM_TYPE) returning TO_TYPE. */
+#define _CLC_CONVERT_DECL(FROM_TYPE, TO_TYPE, SUFFIX) \
+ _CLC_OVERLOAD _CLC_DECL TO_TYPE convert_##TO_TYPE##SUFFIX(FROM_TYPE x);
+/* Same, for the scalar pair and for its 2/3/4/8/16-element vector forms. */
+#define _CLC_VECTOR_CONVERT_DECL(FROM_TYPE, TO_TYPE, SUFFIX) \
+ _CLC_CONVERT_DECL(FROM_TYPE, TO_TYPE, SUFFIX) \
+ _CLC_CONVERT_DECL(FROM_TYPE##2, TO_TYPE##2, SUFFIX) \
+ _CLC_CONVERT_DECL(FROM_TYPE##3, TO_TYPE##3, SUFFIX) \
+ _CLC_CONVERT_DECL(FROM_TYPE##4, TO_TYPE##4, SUFFIX) \
+ _CLC_CONVERT_DECL(FROM_TYPE##8, TO_TYPE##8, SUFFIX) \
+ _CLC_CONVERT_DECL(FROM_TYPE##16, TO_TYPE##16, SUFFIX)
+/* One source type to every destination type except double. */
+#define _CLC_VECTOR_CONVERT_FROM1(FROM_TYPE, SUFFIX) \
+ _CLC_VECTOR_CONVERT_DECL(FROM_TYPE, char, SUFFIX) \
+ _CLC_VECTOR_CONVERT_DECL(FROM_TYPE, uchar, SUFFIX) \
+ _CLC_VECTOR_CONVERT_DECL(FROM_TYPE, int, SUFFIX) \
+ _CLC_VECTOR_CONVERT_DECL(FROM_TYPE, uint, SUFFIX) \
+ _CLC_VECTOR_CONVERT_DECL(FROM_TYPE, short, SUFFIX) \
+ _CLC_VECTOR_CONVERT_DECL(FROM_TYPE, ushort, SUFFIX) \
+ _CLC_VECTOR_CONVERT_DECL(FROM_TYPE, long, SUFFIX) \
+ _CLC_VECTOR_CONVERT_DECL(FROM_TYPE, ulong, SUFFIX) \
+ _CLC_VECTOR_CONVERT_DECL(FROM_TYPE, float, SUFFIX)
+/* One source type to every destination type, double included. */
+#define _CLC_VECTOR_CONVERT_FROM(FROM_TYPE, SUFFIX) \
+ _CLC_VECTOR_CONVERT_FROM1(FROM_TYPE, SUFFIX) \
+ _CLC_VECTOR_CONVERT_DECL(FROM_TYPE, double, SUFFIX)
+/* Every source type except double (each still converting to all destinations). */
+#define _CLC_VECTOR_CONVERT_TO1(SUFFIX) \
+ _CLC_VECTOR_CONVERT_FROM(char, SUFFIX) \
+ _CLC_VECTOR_CONVERT_FROM(uchar, SUFFIX) \
+ _CLC_VECTOR_CONVERT_FROM(int, SUFFIX) \
+ _CLC_VECTOR_CONVERT_FROM(uint, SUFFIX) \
+ _CLC_VECTOR_CONVERT_FROM(short, SUFFIX) \
+ _CLC_VECTOR_CONVERT_FROM(ushort, SUFFIX) \
+ _CLC_VECTOR_CONVERT_FROM(long, SUFFIX) \
+ _CLC_VECTOR_CONVERT_FROM(ulong, SUFFIX) \
+ _CLC_VECTOR_CONVERT_FROM(float, SUFFIX)
+/* Adds double as a source type, completing the full source x destination matrix for one suffix. */
+#define _CLC_VECTOR_CONVERT_TO(SUFFIX) \
+ _CLC_VECTOR_CONVERT_TO1(SUFFIX) \
+ _CLC_VECTOR_CONVERT_FROM(double, SUFFIX)
+/* Emits both the saturating (_sat<ROUND>) and the plain (<ROUND>) variants for one rounding suffix. */
+#define _CLC_VECTOR_CONVERT_TO_SUFFIX(ROUND) \
+ _CLC_VECTOR_CONVERT_TO(_sat##ROUND) \
+ _CLC_VECTOR_CONVERT_TO(ROUND)
+/* Instantiate for each rounding suffix, and once with an empty suffix (convert_<T> and convert_<T>_sat). */
+_CLC_VECTOR_CONVERT_TO_SUFFIX(_rtn)
+_CLC_VECTOR_CONVERT_TO_SUFFIX(_rte)
+_CLC_VECTOR_CONVERT_TO_SUFFIX(_rtz)
+_CLC_VECTOR_CONVERT_TO_SUFFIX(_rtp)
+_CLC_VECTOR_CONVERT_TO_SUFFIX()
+/* vloadN for one element type and address space: vload2 is defined inline (reads x[2*offset] and x[2*offset+1]); vload3/4/8/16 are only declared here. */
+#define VLOAD_VECTORIZE(PRIM_TYPE, ADDR_SPACE) \
+ _CLC_OVERLOAD _CLC_INLINE PRIM_TYPE##2 vload2(size_t offset, const ADDR_SPACE PRIM_TYPE *x) \
+ { return (PRIM_TYPE##2)(x[offset<<1] , x[1+(offset<<1)]); } \
+ _CLC_OVERLOAD _CLC_DECL PRIM_TYPE##3 vload3(size_t offset, const ADDR_SPACE PRIM_TYPE *x); \
+ _CLC_OVERLOAD _CLC_DECL PRIM_TYPE##4 vload4(size_t offset, const ADDR_SPACE PRIM_TYPE *x); \
+ _CLC_OVERLOAD _CLC_DECL PRIM_TYPE##8 vload8(size_t offset, const ADDR_SPACE PRIM_TYPE *x); \
+ _CLC_OVERLOAD _CLC_DECL PRIM_TYPE##16 vload16(size_t offset, const ADDR_SPACE PRIM_TYPE *x);
+/* Repeats VLOAD_VECTORIZE for the __private, __local, __constant and __global address spaces; VLOAD_TYPES then repeats that for every scalar element type. */
+#define VLOAD_ADDR_SPACES(__CLC_SCALAR_GENTYPE) \
+ VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __private) \
+ VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __local) \
+ VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __constant) \
+ VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __global) \
+
+#define VLOAD_TYPES() \
+ VLOAD_ADDR_SPACES(char) \
+ VLOAD_ADDR_SPACES(uchar) \
+ VLOAD_ADDR_SPACES(short) \
+ VLOAD_ADDR_SPACES(ushort) \
+ VLOAD_ADDR_SPACES(int) \
+ VLOAD_ADDR_SPACES(uint) \
+ VLOAD_ADDR_SPACES(long) \
+ VLOAD_ADDR_SPACES(ulong) \
+ VLOAD_ADDR_SPACES(float) \
+ VLOAD_ADDR_SPACES(double)\
+
+VLOAD_TYPES()
+/* vstoreN counterpart of the vload generators: vstore2 is defined inline (writes vec.s0 and vec.s1 to mem[2*offset] and mem[2*offset+1]); vstore3/4/8/16 are only declared. Covers __private, __local and __global (no __constant). */
+#define VSTORE_VECTORIZE(PRIM_TYPE, ADDR_SPACE) \
+ _CLC_OVERLOAD _CLC_INLINE void vstore2(PRIM_TYPE##2 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) \
+ { mem[offset<<1] = vec.s0; mem[1+(offset<<1)] = vec.s1; } \
+ _CLC_OVERLOAD _CLC_DECL void vstore3(PRIM_TYPE##3 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem); \
+ _CLC_OVERLOAD _CLC_DECL void vstore4(PRIM_TYPE##4 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem); \
+ _CLC_OVERLOAD _CLC_DECL void vstore8(PRIM_TYPE##8 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem); \
+ _CLC_OVERLOAD _CLC_DECL void vstore16(PRIM_TYPE##16 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem); \
+
+#define VSTORE_ADDR_SPACES(__CLC_SCALAR___CLC_GENTYPE) \
+ VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __private) \
+ VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __local) \
+ VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __global) \
+
+#define VSTORE_TYPES() \
+ VSTORE_ADDR_SPACES(char) \
+ VSTORE_ADDR_SPACES(uchar) \
+ VSTORE_ADDR_SPACES(short) \
+ VSTORE_ADDR_SPACES(ushort) \
+ VSTORE_ADDR_SPACES(int) \
+ VSTORE_ADDR_SPACES(uint) \
+ VSTORE_ADDR_SPACES(long) \
+ VSTORE_ADDR_SPACES(ulong) \
+ VSTORE_ADDR_SPACES(float) \
+ VSTORE_ADDR_SPACES(double) \
+
+VSTORE_TYPES()
+/* The vload/vstore generator macros are local to this section and are removed here. */
+#undef VLOAD_VECTORIZE
+#undef VLOAD_ADDR_SPACES
+#undef VLOAD_TYPES
+#undef VSTORE_VECTORIZE
+#undef VSTORE_ADDR_SPACES
+#undef VSTORE_TYPES
+
+/*-----------------------------------------------------------------------------
+* Relational
+*----------------------------------------------------------------------------*/
+#define INLN(type) /* inline bitselect: each result bit is b's where c has a 1 bit, a's otherwise */ \
+_CLC_OVERLOAD _CLC_INLINE type bitselect(type a, type b, type c) { return a^(c&(b^a)); }
+/* Prototype-only variant: declares the bitselect overload without defining it here. */
+#define DECL(type) \
+_CLC_OVERLOAD _CLC_DECL type bitselect(type a, type b, type c);
+
+INLN(char)
+INLN(uchar)
+INLN(short)
+INLN(ushort)
+INLN(int)
+INLN(uint)
+INLN(long)
+INLN(ulong)
+
+DECL(char2)
+DECL(uchar2)
+INLN(short2)
+INLN(ushort2)
+INLN(int2)
+INLN(uint2)
+DECL(long2)
+DECL(ulong2)
+
+DECL(char3)
+DECL(uchar3)
+DECL(short3)
+DECL(ushort3)
+DECL(int3)
+DECL(uint3)
+DECL(long3)
+DECL(ulong3)
+
+INLN(char4)
+INLN(uchar4)
+INLN(short4)
+INLN(ushort4)
+DECL(int4)
+DECL(uint4)
+DECL(long4)
+DECL(ulong4)
+
+INLN(char8)
+INLN(uchar8)
+DECL(short8)
+DECL(ushort8)
+DECL(int8)
+DECL(uint8)
+DECL(long8)
+DECL(ulong8)
+
+DECL(char16)
+DECL(uchar16)
+DECL(short16)
+DECL(ushort16)
+DECL(int16)
+DECL(uint16)
+DECL(long16)
+DECL(ulong16)
+
+DECL(float)
+DECL(float2)
+DECL(float3)
+DECL(float4)
+DECL(float8)
+DECL(float16)
+
+DECL(double)
+DECL(double2)
+DECL(double3)
+DECL(double4)
+DECL(double8)
+DECL(double16)
+
+#undef INLN
+#undef DECL
+/* Bit-field helpers for doubles viewed as a uint2 (.hi is the word holding the sign). EXTU isolates a field by shifting left by l, then right by r; l and r are parenthesized so expression arguments keep their meaning. */
+#define EXTU(x,l,r) (((x) << (l)) >> (r))
+
+#define SIGND(x) (as_uint2(x).hi >> 31) /* sign bit */
+#define EXPD(x) EXTU(as_uint2(x).hi, 1, 21) /* 11-bit exponent field */
+#define MANTD_HI(x) EXTU(as_uint2(x).hi, 12, 12) /* upper 20 mantissa bits */
+#define MANTD_LO(x) as_uint2(x).lo /* lower 32 mantissa bits */
+#define MANTD_ZERO(x) (MANTD_HI(x) == 0 && MANTD_LO(x) == 0)
+#define ANY_ZEROD(x) ((as_ulong(x) << 1) == 0) /* true for +0 and -0: every bit except the sign is clear */
+#define SUBNORMD(x) (EXPD(x) == 0 && !MANTD_ZERO(x)) /* zero exponent with a non-zero mantissa */
+/* Single-precision field extractors on the raw 32-bit pattern: magnitude (sign bit cleared), sign bit, 8-bit exponent field, 23-bit mantissa field. */
+#define FABSF(x) ((as_uint(x) << 1) >> 1)
+#define SIGNF(x) (as_uint(x) >> 31)
+#define EXPF(x) ((as_uint(x) << 1) >> 24)
+#define MANTF(x) ((as_uint(x) << 9) >> 9)
+/* isordered: neither operand is NaN; isunordered: at least one is. Both are built from isnan(), so the result type follows the isnan overload selected. */
+#define isordered(x,y) (!isnan(x) & !isnan(y))
+#define isunordered(x,y) (isnan(x) | isnan(y))
+/* isnan: the float overload compares the sign-cleared bit pattern against 0x7F800000 (greater means NaN); the double overload forwards to __builtin_isnan. Vector overloads are declared only. */
+_CLC_OVERLOAD _CLC_INLINE int isnan(float x) { return FABSF(x) > 0x7F800000; }
+UNARY_INLINE (double, int, isnan, __builtin_isnan)
+UNARY_VEC_DECL(float, int, isnan)
+UNARY_VEC_DECL(double, long, isnan)
+/* isfinite: a float is finite when its exponent field is not all ones (255); the double overload forwards to __builtin_isfinite. */
+_CLC_OVERLOAD _CLC_INLINE int isfinite(float x) { return EXPF(x) != 255; }
+UNARY_INLINE (double, int, isfinite, __builtin_isfinite)
+UNARY_VEC_DECL(float, int, isfinite)
+UNARY_VEC_DECL(double, long, isfinite)
+/* isinf: a float is infinite when its sign-cleared bit pattern equals 0x7F800000; the double overload forwards to __builtin_isinf. */
+_CLC_OVERLOAD _CLC_INLINE int isinf(float x) { return FABSF(x) == 0x7F800000; }
+UNARY_INLINE (double, int, isinf, __builtin_isinf)
+UNARY_VEC_DECL(float, int, isinf)
+UNARY_VEC_DECL(double, long, isinf)
+/* isnormal: the float exponent field is neither 0 nor 255; the double overload forwards to __builtin_isnormal. */
+_CLC_OVERLOAD _CLC_INLINE int isnormal(float x) { return EXPF(x) != 0 && EXPF(x) != 255; }
+UNARY_INLINE (double, int, isnormal, __builtin_isnormal)
+UNARY_VEC_DECL(float, int, isnormal)
+UNARY_VEC_DECL(double, long, isnormal)
+/* signbit: the scalar overloads return the raw sign bit (0 or 1). */
+_CLC_OVERLOAD _CLC_INLINE int signbit(float x) { return SIGNF(x); }
+_CLC_OVERLOAD _CLC_INLINE int signbit(double x) { return SIGND(x); }
+UNARY_VEC_DECL(float, int, signbit)
+UNARY_VEC_DECL(double, long, signbit)
+/* copysign(x, y): the magnitude bits of x combined with the sign bit of y, assembled on the integer bit pattern. */
+_CLC_OVERLOAD _CLC_INLINE float copysign(float x, float y)
+ { return as_float(FABSF(x) | (SIGNF(y) << 31)); }
+/* Double overload: same construction on the 64-bit pattern (bit 63 of x cleared, only bit 63 of y kept). */
+_CLC_OVERLOAD _CLC_INLINE double copysign(double x, double y)
+{ return as_double(((as_ulong(x) << 1) >> 1) | ((as_ulong(y) >> 63) << 63)); }
+
+BINARY_VEC_DECL(float, float, copysign)
+BINARY_VEC_DECL(double, double, copysign)
+
+/* isequal: result of x == y as an int; the double overload returns 0 outright when either operand is NaN. Vector overloads are declared only. */
+_CLC_OVERLOAD _CLC_INLINE int isequal(float x, float y) { return x == y; }
+_CLC_OVERLOAD _CLC_INLINE int isequal(double x, double y)
+{
+    const int unordered = isnan(x) || isnan(y);
+    return unordered ? 0 : (x == y);
+}
+BINARY_VEC_DECL(float, int, isequal)
+BINARY_VEC_DECL(double, long, isequal)
+
+/* isnotequal: result of x != y as an int; the double overload returns 1 outright when either operand is NaN. Vector overloads are declared only. */
+_CLC_OVERLOAD _CLC_INLINE int isnotequal(float x, float y) { return x != y; }
+_CLC_OVERLOAD _CLC_INLINE int isnotequal(double x, double y)
+{
+    const int unordered = isnan(x) || isnan(y);
+    return unordered ? 1 : (x != y);
+}
+BINARY_VEC_DECL(float, int, isnotequal)
+BINARY_VEC_DECL(double, long, isnotequal)
+/* isless: result of x < y as an int; the double overload returns 0 outright when either operand is NaN. Vector overloads are declared only. */
+_CLC_OVERLOAD _CLC_INLINE int isless(float x, float y) { return x < y; }
+_CLC_OVERLOAD _CLC_INLINE int isless(double x, double y)
+{
+    const int unordered = isnan(x) || isnan(y);
+    return unordered ? 0 : (x < y);
+}
+BINARY_VEC_DECL(float, int, isless)
+BINARY_VEC_DECL(double, long, isless)
+/* islessequal: result of x <= y as an int; the double overload returns 0 outright when either operand is NaN. Vector overloads are declared only. */
+_CLC_OVERLOAD _CLC_INLINE int islessequal(float x, float y) { return x <= y; }
+_CLC_OVERLOAD _CLC_INLINE int islessequal(double x, double y)
+{
+    const int unordered = isnan(x) || isnan(y);
+    return unordered ? 0 : (x <= y);
+}
+BINARY_VEC_DECL(float, int, islessequal)
+BINARY_VEC_DECL(double, long, islessequal)
+/* isgreater: result of x > y as an int; the double overload returns 0 outright when either operand is NaN. Vector overloads are declared only. */
+_CLC_OVERLOAD _CLC_INLINE int isgreater(float x, float y) { return x > y; }
+_CLC_OVERLOAD _CLC_INLINE int isgreater(double x, double y)
+{
+    const int unordered = isnan(x) || isnan(y);
+    return unordered ? 0 : (x > y);
+}
+BINARY_VEC_DECL(float, int, isgreater)
+BINARY_VEC_DECL(double, long, isgreater)
+/* isgreaterequal: result of x >= y as an int; the double overload returns 0 outright when either operand is NaN. Vector overloads are declared only. */
+_CLC_OVERLOAD _CLC_INLINE int isgreaterequal(float x, float y) { return x >= y; }
+_CLC_OVERLOAD _CLC_INLINE int isgreaterequal(double x, double y)
+{
+    const int unordered = isnan(x) || isnan(y);
+    return unordered ? 0 : (x >= y);
+}
+BINARY_VEC_DECL(float, int, isgreaterequal)
+BINARY_VEC_DECL(double, long, isgreaterequal)
+/* islessgreater: bitwise OR of isless(x, y) and isgreater(x, y) for the scalar overloads. Vector overloads are declared only. */
+_CLC_OVERLOAD _CLC_INLINE int islessgreater(float x, float y)
+{ return isless(x,y) | isgreater(x, y); }
+_CLC_OVERLOAD _CLC_INLINE int islessgreater(double x, double y)
+{ return isless(x,y) | isgreater(x, y); }
+BINARY_VEC_DECL(float, int, islessgreater)
+BINARY_VEC_DECL(double, long, islessgreater)
+
+#undef EXPD
+#undef MANTD_HI
+#undef MANTD_LO
+#undef MANTD_ZERO
+#undef SIGND
+#undef FABSF
+#undef SIGNF
+#undef EXPF
+#undef MANTF
+#undef EXTU
+
+/* any(): the scalar form is true when the value is negative (sign bit set); the 2-element form ORs the lanes, so one set sign bit makes the result negative. Wider vectors are declared only. Instantiated for the signed integer types. */
+#define TEMPLATE(type) \
+_CLC_OVERLOAD _CLC_INLINE int any(type x) { return x < 0; } \
+_CLC_OVERLOAD _CLC_INLINE int any(type##2 x) { return (x.s0 | x.s1) < 0; } \
+_CLC_OVERLOAD _CLC_DECL int any(type##3 x); \
+_CLC_OVERLOAD _CLC_DECL int any(type##4 x); \
+_CLC_OVERLOAD _CLC_DECL int any(type##8 x); \
+_CLC_OVERLOAD _CLC_DECL int any(type##16 x); \
+
+TEMPLATE(char)
+TEMPLATE(short)
+TEMPLATE(int)
+TEMPLATE(long)
+
+#undef TEMPLATE
+/* all(): the scalar form is true when the value is negative (sign bit set); the 2-element form ANDs the lanes, so every lane's sign bit must be set. Wider vectors are declared only. Instantiated for the signed integer types. */
+#define TEMPLATE(type) \
+_CLC_OVERLOAD _CLC_INLINE int all(type x) { return x < 0; } \
+_CLC_OVERLOAD _CLC_INLINE int all(type##2 x) { return (x.s0 & x.s1) < 0; } \
+_CLC_OVERLOAD _CLC_DECL int all(type##3 x); \
+_CLC_OVERLOAD _CLC_DECL int all(type##4 x); \
+_CLC_OVERLOAD _CLC_DECL int all(type##8 x); \
+_CLC_OVERLOAD _CLC_DECL int all(type##16 x); \
+
+TEMPLATE(char)
+TEMPLATE(short)
+TEMPLATE(int)
+TEMPLATE(long)
+
+#undef TEMPLATE
+/* Scalar select(a, b, c): yields b when c is non-zero, otherwise a. `otype` is the type of the condition operand. */
+#define DEFINE(type, otype) \
+_CLC_OVERLOAD _CLC_INLINE type select(type a, type b, otype c) { return c ? b : a; }
+
+DEFINE(char, char)
+DEFINE(char, uchar)
+DEFINE(uchar, char)
+DEFINE(uchar, uchar)
+DEFINE(short, short)
+DEFINE(short, ushort)
+DEFINE(ushort, short)
+DEFINE(ushort, ushort)
+DEFINE(int, int)
+DEFINE(int, uint)
+DEFINE(uint, int)
+DEFINE(uint, uint)
+DEFINE(long, long)
+DEFINE(long, ulong)
+DEFINE(ulong, long)
+DEFINE(ulong, ulong)
+DEFINE(float, int)
+DEFINE(float, uint)
+DEFINE(double, long)
+DEFINE(double, ulong)
+
+#undef DEFINE
+/* Vector select prototypes: for each value type, one overload with a signed (`itype`) and one with an unsigned (`utype`) condition vector. */
+#define DECLARATION(type, itype, utype) \
+_CLC_OVERLOAD _CLC_DECL type select(type a, type b, itype c);\
+_CLC_OVERLOAD _CLC_DECL type select(type a, type b, utype c);
+/* Stamps DECLARATION out for the 2/3/4/8/16-element vectors; SELECT_EXPAND_TYPES lists which integer condition types pair with each value type. */
+#define SELECT_EXPAND_SIZES(type,itype,utype) \
+ DECLARATION(_VEC_TYPE(type,2), _VEC_TYPE(itype,2), _VEC_TYPE(utype,2)) \
+ DECLARATION(_VEC_TYPE(type,3), _VEC_TYPE(itype,3), _VEC_TYPE(utype,3)) \
+ DECLARATION(_VEC_TYPE(type,4), _VEC_TYPE(itype,4), _VEC_TYPE(utype,4)) \
+ DECLARATION(_VEC_TYPE(type,8), _VEC_TYPE(itype,8), _VEC_TYPE(utype,8)) \
+ DECLARATION(_VEC_TYPE(type,16), _VEC_TYPE(itype,16), _VEC_TYPE(utype,16)) \
+
+#define SELECT_EXPAND_TYPES \
+ SELECT_EXPAND_SIZES(char, char, uchar) \
+ SELECT_EXPAND_SIZES(uchar, char, uchar) \
+ SELECT_EXPAND_SIZES(short, short, ushort) \
+ SELECT_EXPAND_SIZES(ushort, short, ushort) \
+ SELECT_EXPAND_SIZES(int, int, uint) \
+ SELECT_EXPAND_SIZES(uint, int, uint) \
+ SELECT_EXPAND_SIZES(long, long, ulong) \
+ SELECT_EXPAND_SIZES(ulong, long, ulong) \
+ SELECT_EXPAND_SIZES(float, int, uint) \
+ SELECT_EXPAND_SIZES(double, long, ulong)
+
+SELECT_EXPAND_TYPES
+
+#undef DECLARATION
+#undef SELECT_EXPAND_SIZES
+#undef SELECT_EXPAND_TYPES
+
+/*-----------------------------------------------------------------------------
+* Math: integer limits (char is treated as signed), float/double limits, constants
+*----------------------------------------------------------------------------*/
+#define CHAR_BIT 8
+#define CHAR_MAX SCHAR_MAX
+#define CHAR_MIN SCHAR_MIN
+#define INT_MAX 2147483647
+#define INT_MIN (-2147483647 - 1)
+#define LONG_MAX 0x7fffffffffffffffL
+#define LONG_MIN (-0x7fffffffffffffffL - 1)
+#define SCHAR_MAX 127
+#define SCHAR_MIN (-127 - 1)
+#define SHRT_MAX 32767
+#define SHRT_MIN (-32767 - 1)
+#define UCHAR_MAX 255
+#define USHRT_MAX 65535
+#define UINT_MAX 0xffffffff
+#define ULONG_MAX 0xffffffffffffffffUL
+/* Single-precision characteristics (digits, exponent ranges, extreme values) in the style of C's <float.h>. */
+#define FLT_DIG 6
+#define FLT_MANT_DIG 24
+#define FLT_MAX_10_EXP +38
+#define FLT_MAX_EXP +128
+#define FLT_MIN_10_EXP -37
+#define FLT_MIN_EXP -125
+#define FLT_RADIX 2
+#define FLT_MAX 0x1.fffffep127f
+#define FLT_MIN 0x1.0p-126f
+#define FLT_EPSILON 0x1.0p-23f
+/* Double-precision counterparts. DBL_RADIX is an addition of this header; <float.h> itself only has FLT_RADIX. */
+#define DBL_DIG 15
+#define DBL_MANT_DIG 53
+#define DBL_MAX_10_EXP +308
+#define DBL_MAX_EXP +1024
+#define DBL_MIN_10_EXP -307
+#define DBL_MIN_EXP -1021
+#define DBL_RADIX 2
+#define DBL_MAX 0x1.fffffffffffffp1023
+#define DBL_MIN 0x1.0p-1022
+#define DBL_EPSILON 0x1.0p-52
+/* Math constants written as double-precision literals (no suffix). */
+#define M_E 2.7182818284590452354 /* e */
+#define M_LOG2E 1.4426950408889634074 /* log_2 e */
+#define M_LOG10E 0.43429448190325182765 /* log_10 e */
+#define M_LN2 0.69314718055994530942 /* log_e 2 */
+#define M_LN10 2.30258509299404568402 /* log_e 10 */
+#define M_PI 3.14159265358979323846 /* pi */
+#define M_PI_2 1.57079632679489661923 /* pi/2 */
+#define M_PI_4 0.78539816339744830962 /* pi/4 */
+#define M_1_PI 0.31830988618379067154 /* 1/pi */
+#define M_2_PI 0.63661977236758134308 /* 2/pi */
+#define M_2_SQRTPI 1.12837916709551257390 /* 2/sqrt(pi) */
+#define M_SQRT2 1.41421356237309504880 /* sqrt(2) */
+#define M_SQRT1_2 0.70710678118654752440 /* 1/sqrt(2) */
+/* Single-precision math constants. They carry an f suffix so they have type float: aliasing the double constants promoted float expressions to double and could select double overloads. */
+#define M_E_F 2.7182818284590452354f /* e */
+#define M_LOG2E_F 1.4426950408889634074f /* log_2 e */
+#define M_LOG10E_F 0.43429448190325182765f /* log_10 e */
+#define M_LN2_F 0.69314718055994530942f /* log_e 2 */
+#define M_LN10_F 2.30258509299404568402f /* log_e 10 */
+#define M_PI_F 3.14159265358979323846f /* pi */
+#define M_PI_2_F 1.57079632679489661923f /* pi/2 */
+#define M_PI_4_F 0.78539816339744830962f /* pi/4 */
+#define M_1_PI_F 0.31830988618379067154f /* 1/pi */
+#define M_2_PI_F 0.63661977236758134308f /* 2/pi */
+#define M_2_SQRTPI_F 1.12837916709551257390f /* 2/sqrt(pi) */
+#define M_SQRT2_F 1.41421356237309504880f /* sqrt(2) */
+#define M_SQRT1_2_F 0.70710678118654752440f /* 1/sqrt(2) */
+/* MAXFLOAT is the largest finite float (FLT_MAX); HUGE_VALF comes from a compiler builtin; INFINITY and NAN are written as float constant divisions by zero. */
+#define MAXFLOAT FLT_MAX
+#define HUGE_VALF __builtin_huge_valf()
+#define INFINITY (1.0f / 0.0f)
+#define NAN (0.0f / 0.0f)
+/* Double-precision HUGE_VAL, also from a compiler builtin. */
+#define HUGE_VAL __builtin_huge_val()
+/* NOTE(review): presumably the values ilogb() reports for zero and NaN arguments — inferred from the names; confirm against the ilogb implementation. */
+#define FP_ILOGB0 (-INT_MAX)
+#define FP_ILOGBNAN (INT_MAX)
+/* Per-function generators: UNARY/BINARY/TERNARY(f) define the scalar float and double overloads of f as inline forwards to f##f and f##d (e.g. acos -> acosf / acosd) and declare the vector overloads. UNARYT and UNARYT_ALT (identical expansions) do the same for one explicit type pair and target `op`. */
+#define UNARY(function) \
+UNARY_INLINE (float, float, function, function##f)\
+UNARY_INLINE (double, double, function, function##d)\
+UNARY_VEC_DECL(float, float, function)\
+UNARY_VEC_DECL(double, double, function)\
+
+#define UNARYT(type1, type2, function,op) \
+UNARY_INLINE (type1, type2, function, op)\
+UNARY_VEC_DECL(type1, type2, function)\
+
+#define UNARYT_ALT(type1, type2, function, op) \
+UNARY_INLINE (type1, type2, function, op) \
+UNARY_VEC_DECL(type1, type2, function) \
+
+#define BINARY(function) \
+BINARY_INLINE (float, float, function, function##f)\
+BINARY_INLINE (double, double, function, function##d)\
+BINARY_VEC_DECL(float, float, function)\
+BINARY_VEC_DECL(double, double, function)\
+
+#define TERNARY(function) \
+TERNARY_INLINE (float, float, function, function##f)\
+TERNARY_INLINE (double, double, function, function##d)\
+TERNARY_VEC_DECL(float, float, function)\
+TERNARY_VEC_DECL(double, double, function)\
+
+/*-------------------------------------------------------------------------
+* Prototypes for the math builtins
+*------------------------------------------------------------------------*/
+UNARY(acos)
+UNARY(acosh)
+
+/* acospi(x) = acos(x) / pi. The float overload scales by a float-typed 1/pi
+ * so the arithmetic stays in single precision, as atan2pi already does. */
+_CLC_OVERLOAD _CLC_INLINE float acospi(float x) { return acosf(x) * (float) M_1_PI; }
+_CLC_OVERLOAD _CLC_INLINE double acospi(double x) { return acosd(x) * M_1_PI; }
+UNARY_VEC_DECL(float, float, acospi)
+UNARY_VEC_DECL(double, double, acospi)
+
+UNARY(asin)
+UNARY(asinh)
+
+/* asinpi(x) = asin(x) / pi. The float overload scales by a float-typed 1/pi
+ * so the arithmetic stays in single precision, as atan2pi already does. */
+_CLC_OVERLOAD _CLC_INLINE float asinpi(float x) { return asinf(x) * (float) M_1_PI; }
+_CLC_OVERLOAD _CLC_INLINE double asinpi(double x) { return asind(x) * M_1_PI; }
+UNARY_VEC_DECL(float, float, asinpi)
+UNARY_VEC_DECL(double, double, asinpi)
+
+UNARY(atan)
+UNARY(atanh)
+
+/* atanpi(x) = atan(x) / pi. The float overload scales by a float-typed 1/pi
+ * so the arithmetic stays in single precision, as atan2pi already does. */
+_CLC_OVERLOAD _CLC_INLINE float atanpi(float x) { return atanf(x) * (float) M_1_PI; }
+_CLC_OVERLOAD _CLC_INLINE double atanpi(double x) { return atand(x) * M_1_PI; }
+UNARY_VEC_DECL(float, float, atanpi)
+UNARY_VEC_DECL(double, double, atanpi)
+
+BINARY(atan2)
+
+/* atan2pi(y, x) = atan2(y, x) / pi, implemented as a multiply by 1/pi.
+ * The float overload narrows the constant to float before multiplying. */
+_CLC_OVERLOAD _CLC_INLINE float atan2pi(float y, float x)
+{
+    return (float) M_1_PI * atan2f(y, x);
+}
+_CLC_OVERLOAD _CLC_INLINE double atan2pi(double y, double x)
+{
+    return M_1_PI * atan2d(y, x);
+}
+BINARY_VEC_DECL(float, float, atan2pi)
+BINARY_VEC_DECL(double, double, atan2pi)
+
+UNARY(cbrt)
+UNARY(ceil)
+
+UNARY(cos)
+UNARY(cosh)
+
+/* cospi(x) computes cos(pi * x): the argument is scaled by pi, not the
+ * result. The product pi * x is rounded before the cosine is taken, so
+ * accuracy degrades for large |x|. */
+_CLC_OVERLOAD _CLC_INLINE float cospi(float x) { return cosf(x * (float) M_PI); }
+_CLC_OVERLOAD _CLC_INLINE double cospi(double x) { return cosd(x * M_PI); }
+UNARY_VEC_DECL(float, float, cospi)
+UNARY_VEC_DECL(double, double, cospi)
+
+UNARY(erf)
+UNARY(erfc)
+UNARY(exp)
+UNARY(exp2)
+
+UNARYT(float, float, exp10, builtin_exp10f)
+UNARYT(double, double, exp10, builtin_exp10)
+
+UNARY(expm1)
+
+/* fabs: scalar overloads forward to _fabsf / _fabs; vector overloads are
+ * declared only. These are plain macro invocations, not part of a macro
+ * definition, so they carry no line-continuation backslashes. */
+UNARY_INLINE (float, float, fabs, _fabsf)
+UNARY_INLINE (double, double, fabs, _fabs)
+UNARY_VEC_DECL(float, float, fabs)
+UNARY_VEC_DECL(double, double, fabs)
+
+BINARY(fdim)
+UNARY(floor)
+
+TERNARY(fma)
+
+BINARY(fmax)
+BINARY(fmin)
+BINARY(fmod)
+BINARY(hypot)
+
+UNARYT_ALT(float, int, ilogb, ilogbf)
+UNARYT_ALT(double, int, ilogb, ilogbd)
+
+BINARY_INLINE_ALT (float, float, int, ldexp, ldexpf)
+BINARY_INLINE_ALT (double, double, int, ldexp, __builtin_ldexp)
+BINARY_VEC_DECL_ALT(float, float, int, ldexp)
+BINARY_VEC_DECL_ALT(double, double, int, ldexp)
+
+UNARY(lgamma)
+UNARY(log)
+UNARY(log2)
+UNARY(log10)
+UNARY(log1p)
+UNARY(logb)
+
+/* mad(a, b, c): multiply-add, a * b + c, evaluated as one expression. */
+_CLC_OVERLOAD _CLC_INLINE float mad(float a, float b, float c)
+{
+    return a * b + c;
+}
+_CLC_OVERLOAD _CLC_INLINE double mad(double a, double b, double c)
+{
+    return a * b + c;
+}
+TERNARY_VEC_DECL(float, float, mad)
+TERNARY_VEC_DECL(double, double, mad)
+
+/*
+ * maxmag(x, y): returns whichever argument has the larger magnitude, with
+ * its sign intact. When neither magnitude is strictly larger (a tie, or an
+ * unordered comparison) it falls back to fmax(x, y).
+ */
+_CLC_OVERLOAD _CLC_INLINE float maxmag(float x, float y)
+{
+    float ax = fabs(x), ay = fabs(y);
+    return ax > ay ? x : (ay > ax ? y : fmax(x, y));
+}
+_CLC_OVERLOAD _CLC_INLINE double maxmag(double x, double y)
+{
+    double ax = fabs(x), ay = fabs(y);
+    return ax > ay ? x : (ay > ax ? y : fmax(x, y));
+}
+BINARY_VEC_DECL(float, float, maxmag)
+BINARY_VEC_DECL(double, double, maxmag)
+
+/*
+ * minmag(x, y): returns whichever argument has the smaller magnitude, with
+ * its sign intact. When neither magnitude is strictly smaller (a tie, or an
+ * unordered comparison) it falls back to fmin(x, y).
+ */
+_CLC_OVERLOAD _CLC_INLINE float minmag(float x, float y)
+{
+    float ax = fabs(x), ay = fabs(y);
+    return ax < ay ? x : (ay < ax ? y : fmin(x, y));
+}
+_CLC_OVERLOAD _CLC_INLINE double minmag(double x, double y)
+{
+    double ax = fabs(x), ay = fabs(y);
+    return ax < ay ? x : (ay < ax ? y : fmin(x, y));
+}
+BINARY_VEC_DECL(float, float, minmag)
+BINARY_VEC_DECL(double, double, minmag)
+
+/* nan(nancode): builds a NaN by OR-ing nancode into a fixed bit pattern
+ * (0x7FC00000 for float, 0x7FF8000000000000 for double) and reinterpreting
+ * the bits as a floating-point value. nancode is not masked, so any bits it
+ * sets outside the mantissa also end up in the result. */
+_CLC_OVERLOAD _CLC_INLINE float nan(uint nancode)
+ { return as_float(0x7FC00000 | nancode); }
+_CLC_OVERLOAD _CLC_INLINE double nan(ulong nancode)
+ { return as_double(0x7FF8000000000000ul | nancode); }
+UNARY_VEC_DECL(uint, float, nan)
+UNARY_VEC_DECL(ulong, double, nan)
+
+BINARY(nextafter)
+BINARY(pow)
+
+_CLC_PROTECTED double builtin_pow(double x, double y);
+
+BINARY_INLINE_ALT2 (float, float, int, pown, powf)
+BINARY_INLINE_ALT2 (double, double, int, pown, builtin_pow)
+BINARY_VEC_DECL_ALT(float, float, int, pown)
+BINARY_VEC_DECL_ALT(double, double, int, pown)
+
+/* powr(x, y): forwards unchanged to powf (float) or pow (double). */
+_CLC_OVERLOAD _CLC_INLINE float powr(float x, float y)
+{
+    return powf(x, y);
+}
+_CLC_OVERLOAD _CLC_INLINE double powr(double x, double y)
+{
+    return pow(x, y);
+}
+BINARY_VEC_DECL(float, float, powr)
+BINARY_VEC_DECL(double, double, powr)
+
+BINARY(remainder)
+UNARY(rint)
+
+/* rootn helpers: the n-th root of a, computed as builtin_pow(a, 1/n).
+ * Both arguments are parenthesized so that expression arguments (e.g.
+ * n + 1) are converted as a whole rather than only their first operand.
+ * NOTE(review): if builtin_pow follows C pow() semantics, a negative 'a'
+ * yields NaN even for odd n -- confirm whether that case needs handling. */
+#define builtin_rootnf(a,b) (builtin_pow((a), 1.0f / (float) (b)))
+#define builtin_rootn(a,b) (builtin_pow((a), 1.0 / (double) (b)))
+
+BINARY_INLINE_ALT2 (float, float, int, rootn, builtin_rootnf)
+BINARY_INLINE_ALT2 (double, double, int, rootn, builtin_rootn)
+BINARY_VEC_DECL_ALT(float, float, int, rootn)
+BINARY_VEC_DECL_ALT(double, double, int, rootn)
+
+UNARY(round)
+UNARY(sqrt)
+
+/* rsqrt(x): reciprocal square root, computed as one divided by the root. */
+_CLC_OVERLOAD _CLC_INLINE float rsqrt(float x)
+{
+    return 1.0f / sqrtf(x);
+}
+_CLC_OVERLOAD _CLC_INLINE double rsqrt(double x)
+{
+    return 1.0 / sqrt(x);
+}
+UNARY_VEC_DECL(float, float, rsqrt)
+UNARY_VEC_DECL(double, double, rsqrt)
+
+UNARY(sin)
+UNARY(sinh)
+
+/* sinpi(x) computes sin(pi * x): the argument is scaled by pi, not the
+ * result. The product pi * x is rounded before the sine is taken, so
+ * accuracy degrades for large |x|. */
+_CLC_OVERLOAD _CLC_INLINE float sinpi(float x) { return sinf(x * (float) M_PI); }
+_CLC_OVERLOAD _CLC_INLINE double sinpi(double x) { return sind(x * M_PI); }
+UNARY_VEC_DECL(float, float, sinpi)
+UNARY_VEC_DECL(double, double, sinpi)
+
+UNARY(tan)
+UNARY(tanh)
+UNARY(trunc)
+
+/* tanpi(x) computes tan(pi * x): the argument is scaled by pi, not the
+ * result. The product pi * x is rounded before the tangent is taken, so
+ * accuracy degrades for large |x|. */
+_CLC_OVERLOAD _CLC_INLINE float tanpi(float x) { return tanf(x * (float) M_PI); }
+_CLC_OVERLOAD _CLC_INLINE double tanpi(double x) { return tand(x * M_PI); }
+UNARY_VEC_DECL(float, float, tanpi)
+UNARY_VEC_DECL(double, double, tanpi)
+
+UNARY(tgamma)
+
+/*-----------------------------------------------------------------------------
+* Native versions
+*----------------------------------------------------------------------------*/
+/* native_* entry points are aliases for the full-precision builtins.
+ * native_log is included so the list matches the half_* aliases. */
+#define native_sin(x) sin(x)
+#define native_cos(x) cos(x)
+#define native_tan(x) tan(x)
+#define native_powr(x,y) powr(x,y)
+#define native_exp(x) exp(x)
+#define native_exp2(x) exp2(x)
+#define native_exp10(x) exp10(x)
+#define native_log(x) log(x)
+#define native_log2(x) log2(x)
+#define native_log10(x) log10(x)
+
+/* native_divide(x, y): plain division x / y. */
+_CLC_OVERLOAD _CLC_INLINE float native_divide(float x, float y)
+{
+    return x / y;
+}
+_CLC_OVERLOAD _CLC_INLINE double native_divide(double x, double y)
+{
+    return x / y;
+}
+BINARY_VEC_DECL(float, float, native_divide)
+BINARY_VEC_DECL(double, double, native_divide)
+
+/* native_recip(x): plain reciprocal 1 / x in the argument's own type. */
+_CLC_OVERLOAD _CLC_INLINE float native_recip(float x)
+{
+    return 1.0f / x;
+}
+_CLC_OVERLOAD _CLC_INLINE double native_recip(double x)
+{
+    return 1.0 / x;
+}
+UNARY_VEC_DECL(float, float, native_recip)
+UNARY_VEC_DECL(double, double, native_recip)
+
+#define native_rsqrt(x) rsqrt(x)
+#define native_sqrt(x) sqrt(x)
+
+/*-----------------------------------------------------------------------------
+* Half versions
+*----------------------------------------------------------------------------*/
+/* half_* entry points are aliases: each macro forwards its arguments
+ * unchanged to the builtin of the same name without the half_ prefix. */
+#define half_sin(x) sin(x)
+#define half_cos(x) cos(x)
+#define half_tan(x) tan(x)
+#define half_powr(x,y) powr(x,y)
+#define half_exp(x) exp(x)
+#define half_exp2(x) exp2(x)
+#define half_exp10(x) exp10(x)
+#define half_log(x) log(x)
+#define half_log2(x) log2(x)
+#define half_log10(x) log10(x)
+
+/* half_divide(x, y): plain division x / y. */
+_CLC_OVERLOAD _CLC_INLINE float half_divide(float x, float y)
+{
+    return x / y;
+}
+_CLC_OVERLOAD _CLC_INLINE double half_divide(double x, double y)
+{
+    return x / y;
+}
+BINARY_VEC_DECL(float, float, half_divide)
+BINARY_VEC_DECL(double, double, half_divide)
+
+/* half_recip(x): plain reciprocal 1 / x in the argument's own type. */
+_CLC_OVERLOAD _CLC_INLINE float half_recip(float x)
+{
+    return 1.0f / x;
+}
+_CLC_OVERLOAD _CLC_INLINE double half_recip(double x)
+{
+    return 1.0 / x;
+}
+UNARY_VEC_DECL(float, float, half_recip)
+UNARY_VEC_DECL(double, double, half_recip)
+
+#define half_rsqrt(x) rsqrt(x)
+#define half_sqrt(x) sqrt(x)
+
+/* Retire the helper macros used to instantiate the math builtins above, so
+ * that none of them (UNARYT and UNARYT_ALT included) leak into the rest of
+ * the header. */
+#undef UNARY
+#undef UNARYT
+#undef UNARYT_ALT
+#undef BINARY
+#undef TERNARY
+
+/*-----------------------------------------------------------------------------
+* Functions with a pointer out-parameter, overloaded per pointer address space
+*----------------------------------------------------------------------------*/
+/* SCALAR_BODY(type, op, ptr_type): function body for a scalar wrapper whose
+ * parameters are named 'x' and 'ptr'. It calls op(x, &power) with a local
+ * 'power' of type ptr_type, copies that local out through *ptr, and returns
+ * op's result. */
+#define SCALAR_BODY(type, op, ptr_type) \
+{ \
+ ptr_type power; \
+ type result = op(x, &power); \
+ *ptr = power; \
+ return result; \
+} \
+
+/* VECTOR_BODY_<n>(op, ptr_type): applies the scalar 'op' to each of the n
+ * components of 'x', storing results in 'temp' and the pointer outputs in
+ * the matching element of 'itemp', which is addressed as an array of
+ * ptr_type. Each width reuses the next smaller one and adds the remaining
+ * components. 'x', 'temp' and 'itemp' must exist at the expansion site. */
+#define VECTOR_BODY_2(op, ptr_type) \
+ temp.s0 = op(x.s0, &(((ptr_type*)&itemp)[0])); \
+ temp.s1 = op(x.s1, &(((ptr_type*)&itemp)[1])); \
+
+#define VECTOR_BODY_3(op, ptr_type) \
+ VECTOR_BODY_2(op, ptr_type) \
+ temp.s2 = op(x.s2, &(((ptr_type*)&itemp)[2])); \
+
+#define VECTOR_BODY_4(op, ptr_type) \
+ VECTOR_BODY_3(op, ptr_type) \
+ temp.s3 = op(x.s3, &(((ptr_type*)&itemp)[3])); \
+
+#define VECTOR_BODY_8(op, ptr_type) \
+ VECTOR_BODY_4(op, ptr_type) \
+ temp.s4 = op(x.s4, &(((ptr_type*)&itemp)[4])); \
+ temp.s5 = op(x.s5, &(((ptr_type*)&itemp)[5])); \
+ temp.s6 = op(x.s6, &(((ptr_type*)&itemp)[6])); \
+ temp.s7 = op(x.s7, &(((ptr_type*)&itemp)[7])); \
+
+#define VECTOR_BODY_16(op, ptr_type) \
+ VECTOR_BODY_8(op, ptr_type) \
+ temp.s8 = op(x.s8, &(((ptr_type*)&itemp)[8])); \
+ temp.s9 = op(x.s9, &(((ptr_type*)&itemp)[9])); \
+ temp.sa = op(x.sa, &(((ptr_type*)&itemp)[10])); \
+ temp.sb = op(x.sb, &(((ptr_type*)&itemp)[11])); \
+ temp.sc = op(x.sc, &(((ptr_type*)&itemp)[12])); \
+ temp.sd = op(x.sd, &(((ptr_type*)&itemp)[13])); \
+ temp.se = op(x.se, &(((ptr_type*)&itemp)[14])); \
+ temp.sf = op(x.sf, &(((ptr_type*)&itemp)[15])); \
+
+/* VECTOR_BODY(prim_type, num, op, ptr_type): complete function body for a
+ * num-wide vector wrapper with parameters 'x' and 'ptr'. It declares the
+ * temporaries, runs VECTOR_BODY_<num>, writes 'itemp' out through *ptr and
+ * returns 'temp'. */
+#define VECTOR_BODY(prim_type, num, op, ptr_type) \
+{ \
+ prim_type##num temp; \
+ ptr_type##num itemp; \
+ VECTOR_BODY_##num(op, ptr_type)\
+ *ptr = itemp; \
+ return temp; \
+} \
+
+/* modf: one overload per address space of the output pointer (global,
+ * local, private); the vector and double overloads that follow repeat the
+ * same pattern. Presumably the integral part is stored through iptr and the
+ * fractional part returned, as with C modf -- confirm in the implementation. */
+_CLC_OVERLOAD _CLC_DECL float modf(float x, global float * iptr);
+_CLC_OVERLOAD _CLC_DECL float modf(float x, local float * iptr);
+_CLC_OVERLOAD _CLC_DECL float modf(float x, private float * iptr);
+
+_CLC_OVERLOAD _CLC_DECL float2 modf(float2 x, global float2 * iptr);
+_CLC_OVERLOAD _CLC_DECL float2 modf(float2 x, local float2 * iptr);
+_CLC_OVERLOAD _CLC_DECL float2 modf(float2 x, private float2 * iptr);
+
+_CLC_OVERLOAD _CLC_DECL float3 modf(float3 x, global float3 * iptr);
+_CLC_OVERLOAD _CLC_DECL float3 modf(float3 x, local float3 * iptr);
+_CLC_OVERLOAD _CLC_DECL float3 modf(float3 x, private float3 * iptr);
+
+_CLC_OVERLOAD _CLC_DECL float4 modf(float4 x, global float4 * iptr);
+_CLC_OVERLOAD _CLC_DECL float4 modf(float4 x, local float4 * iptr);
+_CLC_OVERLOAD _CLC_DECL float4 modf(float4 x, private float4 * iptr);
+
+_CLC_OVERLOAD _CLC_DECL float8 modf(float8 x, global float8 * iptr);
+_CLC_OVERLOAD _CLC_DECL float8 modf(float8 x, local float8 * iptr);
+_CLC_OVERLOAD _CLC_DECL float8 modf(float8 x, private float8 * iptr);
+
+_CLC_OVERLOAD _CLC_DECL float16 modf(float16 x, global float16 * iptr);
+_CLC_OVERLOAD _CLC_DECL float16 modf(float16 x, local float16 * iptr);
+_CLC_OVERLOAD _CLC_DECL float16 modf(float16 x, private float16 * iptr);
+
+_CLC_OVERLOAD _CLC_DECL double modf(double x, global double * iptr);
+_CLC_OVERLOAD _CLC_DECL double modf(double x, local double * iptr);
+_CLC_OVERLOAD _CLC_DECL double modf(double x, private double * iptr);
+
+_CLC_OVERLOAD _CLC_DECL double2 modf(double2 x, global double2 * iptr);
+_CLC_OVERLOAD _CLC_DECL double2 modf(double2 x, local double2 * iptr);
+_CLC_OVERLOAD _CLC_DECL double2 modf(double2 x, private double2 * iptr);
+
+_CLC_OVERLOAD _CLC_DECL double3 modf(double3 x, global double3 * iptr);
+_CLC_OVERLOAD _CLC_DECL double3 modf(double3 x, local double3 * iptr);
+_CLC_OVERLOAD _CLC_DECL double3 modf(double3 x, private double3 * iptr);
+
+_CLC_OVERLOAD _CLC_DECL double4 modf(double4 x, global double4 * iptr);
+_CLC_OVERLOAD _CLC_DECL double4 modf(double4 x, local double4 * iptr);
+_CLC_OVERLOAD _CLC_DECL double4 modf(double4 x, private double4 * iptr);
+
+_CLC_OVERLOAD _CLC_DECL double8 modf(double8 x, global double8 * iptr);
+_CLC_OVERLOAD _CLC_DECL double8 modf(double8 x, local double8 * iptr);
+_CLC_OVERLOAD _CLC_DECL double8 modf(double8 x, private double8 * iptr);
+
+_CLC_OVERLOAD _CLC_DECL double16 modf(double16 x, global double16 * iptr);
+_CLC_OVERLOAD _CLC_DECL double16 modf(double16 x, local double16 * iptr);
+_CLC_OVERLOAD _CLC_DECL double16 modf(double16 x, private double16 * iptr);
+
+/* frexp: one overload per address space of the int output pointer (global,
+ * local, private); the vector and double overloads that follow repeat the
+ * same pattern. Presumably the exponent is stored through ptr and the
+ * mantissa returned, as with C frexp -- confirm in the implementation. */
+_CLC_OVERLOAD _CLC_DECL float frexp(float x, global int * ptr);
+_CLC_OVERLOAD _CLC_DECL float frexp(float x, local int * ptr);
+_CLC_OVERLOAD _CLC_DECL float frexp(float x, private int * ptr);
+
+_CLC_OVERLOAD _CLC_DECL float2 frexp(float2 x, global int2 * ptr);
+_CLC_OVERLOAD _CLC_DECL float2 frexp(float2 x, local int2 * ptr);
+_CLC_OVERLOAD _CLC_DECL float2 frexp(float2 x, private int2 * ptr);
+
+_CLC_OVERLOAD _CLC_DECL float3 frexp(float3 x, global int3 * ptr);
+_CLC_OVERLOAD _CLC_DECL float3 frexp(float3 x, local int3 * ptr);
+_CLC_OVERLOAD _CLC_DECL float3 frexp(float3 x, private int3 * ptr);
+
+_CLC_OVERLOAD _CLC_DECL float4 frexp(float4 x, global int4 * ptr);
+_CLC_OVERLOAD _CLC_DECL float4 frexp(float4 x, local int4 * ptr);
+_CLC_OVERLOAD _CLC_DECL float4 frexp(float4 x, private int4 * ptr);
+
+_CLC_OVERLOAD _CLC_DECL float8 frexp(float8 x, global int8 * ptr);
+_CLC_OVERLOAD _CLC_DECL float8 frexp(float8 x, local int8 * ptr);
+_CLC_OVERLOAD _CLC_DECL float8 frexp(float8 x, private int8 * ptr);
+
+_CLC_OVERLOAD _CLC_DECL float16 frexp(float16 x, global int16 * ptr);
+_CLC_OVERLOAD _CLC_DECL float16 frexp(float16 x, local int16 * ptr);
+_CLC_OVERLOAD _CLC_DECL float16 frexp(float16 x, private int16 * ptr);
+
+_CLC_OVERLOAD _CLC_DECL double frexp(double x, global int * ptr);
+_CLC_OVERLOAD _CLC_DECL double frexp(double x, local int * ptr);
+_CLC_OVERLOAD _CLC_DECL double frexp(double x, private int * ptr);
+
+_CLC_OVERLOAD _CLC_DECL double2 frexp(double2 x, global int2 * ptr);
+_CLC_OVERLOAD _CLC_DECL double2 frexp(double2 x, local int2 * ptr);
+_CLC_OVERLOAD _CLC_DECL double2 frexp(double2 x, private int2 * ptr);
+
+_CLC_OVERLOAD _CLC_DECL double3 frexp(double3 x, global int3 * ptr);
+_CLC_OVERLOAD _CLC_DECL double3 frexp(double3 x, local int3 * ptr);
+_CLC_OVERLOAD _CLC_DECL double3 frexp(double3 x, private int3 * ptr);
+
+_CLC_OVERLOAD _CLC_DECL double4 frexp(double4 x, global int4 * ptr);
+_CLC_OVERLOAD _CLC_DECL double4 frexp(double4 x, local int4 * ptr);
+_CLC_OVERLOAD _CLC_DECL double4 frexp(double4 x, private int4 * ptr);
+
+_CLC_OVERLOAD _CLC_DECL double8 frexp(double8 x, global int8 * ptr);
+_CLC_OVERLOAD _CLC_DECL double8 frexp(double8 x, local int8 * ptr);
+_CLC_OVERLOAD _CLC_DECL double8 frexp(double8 x, private int8 * ptr);
+
+_CLC_OVERLOAD _CLC_DECL double16 frexp(double16 x, global int16 * ptr);
+_CLC_OVERLOAD _CLC_DECL double16 frexp(double16 x, local int16 * ptr);
+_CLC_OVERLOAD _CLC_DECL double16 frexp(double16 x, private int16 * ptr);
+
+/* lgamma_r: one overload per address space of the int output pointer
+ * (global, local, private); the vector and double overloads that follow
+ * repeat the same pattern. Presumably the sign of gamma(x) is stored
+ * through ptr -- confirm in the implementation. */
+_CLC_OVERLOAD _CLC_DECL float lgamma_r(float x, global int * ptr);
+_CLC_OVERLOAD _CLC_DECL float lgamma_r(float x, local int * ptr);
+_CLC_OVERLOAD _CLC_DECL float lgamma_r(float x, private int * ptr);
+
+_CLC_OVERLOAD _CLC_DECL float2 lgamma_r(float2 x, global int2 * ptr);
+_CLC_OVERLOAD _CLC_DECL float2 lgamma_r(float2 x, local int2 * ptr);
+_CLC_OVERLOAD _CLC_DECL float2 lgamma_r(float2 x, private int2 * ptr);
+
+_CLC_OVERLOAD _CLC_DECL float3 lgamma_r(float3 x, global int3 * ptr);
+_CLC_OVERLOAD _CLC_DECL float3 lgamma_r(float3 x, local int3 * ptr);
+_CLC_OVERLOAD _CLC_DECL float3 lgamma_r(float3 x, private int3 * ptr);
+
+_CLC_OVERLOAD _CLC_DECL float4 lgamma_r(float4 x, global int4 * ptr);
+_CLC_OVERLOAD _CLC_DECL float4 lgamma_r(float4 x, local int4 * ptr);
+_CLC_OVERLOAD _CLC_DECL float4 lgamma_r(float4 x, private int4 * ptr);
+
+_CLC_OVERLOAD _CLC_DECL float8 lgamma_r(float8 x, global int8 * ptr);
+_CLC_OVERLOAD _CLC_DECL float8 lgamma_r(float8 x, local int8 * ptr);
+_CLC_OVERLOAD _CLC_DECL float8 lgamma_r(float8 x, private int8 * ptr);
+
+_CLC_OVERLOAD _CLC_DECL float16 lgamma_r(float16 x, global int16 * ptr);
+_CLC_OVERLOAD _CLC_DECL float16 lgamma_r(float16 x, local int16 * ptr);
+_CLC_OVERLOAD _CLC_DECL float16 lgamma_r(float16 x, private int16 * ptr);
+
+_CLC_OVERLOAD _CLC_DECL double lgamma_r(double x, global int * ptr);
+_CLC_OVERLOAD _CLC_DECL double lgamma_r(double x, local int * ptr);
+_CLC_OVERLOAD _CLC_DECL double lgamma_r(double x, private int * ptr);
+
+_CLC_OVERLOAD _CLC_DECL double2 lgamma_r(double2 x, global int2 * ptr);
+_CLC_OVERLOAD _CLC_DECL double2 lgamma_r(double2 x, local int2 * ptr);
+_CLC_OVERLOAD _CLC_DECL double2 lgamma_r(double2 x, private int2 * ptr);
+
+_CLC_OVERLOAD _CLC_DECL double3 lgamma_r(double3 x, global int3 * ptr);
+_CLC_OVERLOAD _CLC_DECL double3 lgamma_r(double3 x, local int3 * ptr);
+_CLC_OVERLOAD _CLC_DECL double3 lgamma_r(double3 x, private int3 * ptr);
+
+_CLC_OVERLOAD _CLC_DECL double4 lgamma_r(double4 x, global int4 * ptr);
+_CLC_OVERLOAD _CLC_DECL double4 lgamma_r(double4 x, local int4 * ptr);
+_CLC_OVERLOAD _CLC_DECL double4 lgamma_r(double4 x, private int4 * ptr);
+
+_CLC_OVERLOAD _CLC_DECL double8 lgamma_r(double8 x, global int8 * ptr);
+_CLC_OVERLOAD _CLC_DECL double8 lgamma_r(double8 x, local int8 * ptr);
+_CLC_OVERLOAD _CLC_DECL double8 lgamma_r(double8 x, private int8 * ptr);
+
+_CLC_OVERLOAD _CLC_DECL double16 lgamma_r(double16 x, global int16 * ptr);
+_CLC_OVERLOAD _CLC_DECL double16 lgamma_r(double16 x, local int16 * ptr);
+_CLC_OVERLOAD _CLC_DECL double16 lgamma_r(double16 x, private int16 * ptr);
+
+
+/* fract: one overload per address space of the output pointer (global,
+ * local, private); the vector and double overloads that follow repeat the
+ * same pattern. Presumably floor(x) is stored through ptr and the
+ * fractional remainder returned -- confirm in the implementation. */
+_CLC_OVERLOAD _CLC_DECL float fract(float x, global float * ptr);
+_CLC_OVERLOAD _CLC_DECL float fract(float x, local float * ptr);
+_CLC_OVERLOAD _CLC_DECL float fract(float x, private float * ptr);
+
+_CLC_OVERLOAD _CLC_DECL float2 fract(float2 x, global float2 * ptr);
+_CLC_OVERLOAD _CLC_DECL float2 fract(float2 x, local float2 * ptr);
+_CLC_OVERLOAD _CLC_DECL float2 fract(float2 x, private float2 * ptr);
+
+_CLC_OVERLOAD _CLC_DECL float3 fract(float3 x, global float3 * ptr);
+_CLC_OVERLOAD _CLC_DECL float3 fract(float3 x, local float3 * ptr);
+_CLC_OVERLOAD _CLC_DECL float3 fract(float3 x, private float3 * ptr);
+
+_CLC_OVERLOAD _CLC_DECL float4 fract(float4 x, global float4 * ptr);
+_CLC_OVERLOAD _CLC_DECL float4 fract(float4 x, local float4 * ptr);
+_CLC_OVERLOAD _CLC_DECL float4 fract(float4 x, private float4 * ptr);
+
+_CLC_OVERLOAD _CLC_DECL float8 fract(float8 x, global float8 * ptr);
+_CLC_OVERLOAD _CLC_DECL float8 fract(float8 x, local float8 * ptr);
+_CLC_OVERLOAD _CLC_DECL float8 fract(float8 x, private float8 * ptr);
+
+_CLC_OVERLOAD _CLC_DECL float16 fract(float16 x, global float16 * ptr);
+_CLC_OVERLOAD _CLC_DECL float16 fract(float16 x, local float16 * ptr);
+_CLC_OVERLOAD _CLC_DECL float16 fract(float16 x, private float16 * ptr);
+
+_CLC_OVERLOAD _CLC_DECL double fract(double x, global double * ptr);
+_CLC_OVERLOAD _CLC_DECL double fract(double x, local double * ptr);
+_CLC_OVERLOAD _CLC_DECL double fract(double x, private double * ptr);
+
+_CLC_OVERLOAD _CLC_DECL double2 fract(double2 x, global double2 * ptr);
+_CLC_OVERLOAD _CLC_DECL double2 fract(double2 x, local double2 * ptr);
+_CLC_OVERLOAD _CLC_DECL double2 fract(double2 x, private double2 * ptr);
+
+_CLC_OVERLOAD _CLC_DECL double3 fract(double3 x, global double3 * ptr);
+_CLC_OVERLOAD _CLC_DECL double3 fract(double3 x, local double3 * ptr);
+_CLC_OVERLOAD _CLC_DECL double3 fract(double3 x, private double3 * ptr);
+
+_CLC_OVERLOAD _CLC_DECL double4 fract(double4 x, global double4 * ptr);
+_CLC_OVERLOAD _CLC_DECL double4 fract(double4 x, local double4 * ptr);
+_CLC_OVERLOAD _CLC_DECL double4 fract(double4 x, private double4 * ptr);
+
+_CLC_OVERLOAD _CLC_DECL double8 fract(double8 x, global double8 * ptr);
+_CLC_OVERLOAD _CLC_DECL double8 fract(double8 x, local double8 * ptr);
+_CLC_OVERLOAD _CLC_DECL double8 fract(double8 x, private double8 * ptr);
+
+_CLC_OVERLOAD _CLC_DECL double16 fract(double16 x, global double16 * ptr);
+_CLC_OVERLOAD _CLC_DECL double16 fract(double16 x, local double16 * ptr);
+_CLC_OVERLOAD _CLC_DECL double16 fract(double16 x, private double16 * ptr);
+
+/* remquo: one overload per address space of the int output pointer (global,
+ * local, private); the vector and double overloads that follow repeat the
+ * same pattern. Presumably the remainder is returned and quotient bits are
+ * stored through quo, as with C remquo -- confirm in the implementation. */
+_CLC_OVERLOAD _CLC_DECL float remquo(float x, float y, global int * quo);
+_CLC_OVERLOAD _CLC_DECL float remquo(float x, float y, local int * quo);
+_CLC_OVERLOAD _CLC_DECL float remquo(float x, float y, private int * quo);
+
+_CLC_OVERLOAD _CLC_DECL float2 remquo(float2 x, float2 y, global int2 * quo);
+_CLC_OVERLOAD _CLC_DECL float2 remquo(float2 x, float2 y, local int2 * quo);
+_CLC_OVERLOAD _CLC_DECL float2 remquo(float2 x, float2 y, private int2 * quo);
+
+_CLC_OVERLOAD _CLC_DECL float3 remquo(float3 x, float3 y, global int3 * quo);
+_CLC_OVERLOAD _CLC_DECL float3 remquo(float3 x, float3 y, local int3 * quo);
+_CLC_OVERLOAD _CLC_DECL float3 remquo(float3 x, float3 y, private int3 * quo);
+
+_CLC_OVERLOAD _CLC_DECL float4 remquo(float4 x, float4 y, global int4 * quo);
+_CLC_OVERLOAD _CLC_DECL float4 remquo(float4 x, float4 y, local int4 * quo);
+_CLC_OVERLOAD _CLC_DECL float4 remquo(float4 x, float4 y, private int4 * quo);
+
+_CLC_OVERLOAD _CLC_DECL float8 remquo(float8 x, float8 y, global int8 * quo);
+_CLC_OVERLOAD _CLC_DECL float8 remquo(float8 x, float8 y, local int8 * quo);
+_CLC_OVERLOAD _CLC_DECL float8 remquo(float8 x, float8 y, private int8 * quo);
+
+_CLC_OVERLOAD _CLC_DECL float16 remquo(float16 x, float16 y, global int16 * quo);
+_CLC_OVERLOAD _CLC_DECL float16 remquo(float16 x, float16 y, local int16 * quo);
+_CLC_OVERLOAD _CLC_DECL float16 remquo(float16 x, float16 y, private int16 * quo);
+
+_CLC_OVERLOAD _CLC_DECL double remquo(double x, double y, global int * quo);
+_CLC_OVERLOAD _CLC_DECL double remquo(double x, double y, local int * quo);
+_CLC_OVERLOAD _CLC_DECL double remquo(double x, double y, private int * quo);
+
+_CLC_OVERLOAD _CLC_DECL double2 remquo(double2 x, double2 y, global int2 * quo);
+_CLC_OVERLOAD _CLC_DECL double2 remquo(double2 x, double2 y, local int2 * quo);
+_CLC_OVERLOAD _CLC_DECL double2 remquo(double2 x, double2 y, private int2 * quo);
+
+_CLC_OVERLOAD _CLC_DECL double3 remquo(double3 x, double3 y, global int3 * quo);
+_CLC_OVERLOAD _CLC_DECL double3 remquo(double3 x, double3 y, local int3 * quo);
+_CLC_OVERLOAD _CLC_DECL double3 remquo(double3 x, double3 y, private int3 * quo);
+
+_CLC_OVERLOAD _CLC_DECL double4 remquo(double4 x, double4 y, global int4 * quo);
+_CLC_OVERLOAD _CLC_DECL double4 remquo(double4 x, double4 y, local int4 * quo);
+_CLC_OVERLOAD _CLC_DECL double4 remquo(double4 x, double4 y, private int4 * quo);
+
+_CLC_OVERLOAD _CLC_DECL double8 remquo(double8 x, double8 y, global int8 * quo);
+_CLC_OVERLOAD _CLC_DECL double8 remquo(double8 x, double8 y, local int8 * quo);
+_CLC_OVERLOAD _CLC_DECL double8 remquo(double8 x, double8 y, private int8 * quo);
+
+_CLC_OVERLOAD _CLC_DECL double16 remquo(double16 x, double16 y, global int16 * quo);
+_CLC_OVERLOAD _CLC_DECL double16 remquo(double16 x, double16 y, local int16 * quo);
+_CLC_OVERLOAD _CLC_DECL double16 remquo(double16 x, double16 y, private int16 * quo);
+
+/* sincos: one overload per address space of the output pointer (global,
+ * local, private); the vector and double overloads that follow repeat the
+ * same pattern. Going by the parameter name, the cosine is stored through
+ * cosval and the sine returned -- confirm in the implementation. */
+_CLC_OVERLOAD _CLC_DECL float sincos(float x, global float * cosval);
+_CLC_OVERLOAD _CLC_DECL float sincos(float x, local float * cosval);
+_CLC_OVERLOAD _CLC_DECL float sincos(float x, private float * cosval);
+
+_CLC_OVERLOAD _CLC_DECL float2 sincos(float2 x, global float2 * cosval);
+_CLC_OVERLOAD _CLC_DECL float2 sincos(float2 x, local float2 * cosval);
+_CLC_OVERLOAD _CLC_DECL float2 sincos(float2 x, private float2 * cosval);
+
+_CLC_OVERLOAD _CLC_DECL float3 sincos(float3 x, global float3 * cosval);
+_CLC_OVERLOAD _CLC_DECL float3 sincos(float3 x, local float3 * cosval);
+_CLC_OVERLOAD _CLC_DECL float3 sincos(float3 x, private float3 * cosval);
+
+_CLC_OVERLOAD _CLC_DECL float4 sincos(float4 x, global float4 * cosval);
+_CLC_OVERLOAD _CLC_DECL float4 sincos(float4 x, local float4 * cosval);
+_CLC_OVERLOAD _CLC_DECL float4 sincos(float4 x, private float4 * cosval);
+
+_CLC_OVERLOAD _CLC_DECL float8 sincos(float8 x, global float8 * cosval);
+_CLC_OVERLOAD _CLC_DECL float8 sincos(float8 x, local float8 * cosval);
+_CLC_OVERLOAD _CLC_DECL float8 sincos(float8 x, private float8 * cosval);
+
+_CLC_OVERLOAD _CLC_DECL float16 sincos(float16 x, global float16 * cosval);
+_CLC_OVERLOAD _CLC_DECL float16 sincos(float16 x, local float16 * cosval);
+_CLC_OVERLOAD _CLC_DECL float16 sincos(float16 x, private float16 * cosval);
+
+_CLC_OVERLOAD _CLC_DECL double sincos(double x, global double * cosval);
+_CLC_OVERLOAD _CLC_DECL double sincos(double x, local double * cosval);
+_CLC_OVERLOAD _CLC_DECL double sincos(double x, private double * cosval);
+
+_CLC_OVERLOAD _CLC_DECL double2 sincos(double2 x, global double2 * cosval);
+_CLC_OVERLOAD _CLC_DECL double2 sincos(double2 x, local double2 * cosval);
+_CLC_OVERLOAD _CLC_DECL double2 sincos(double2 x, private double2 * cosval);
+
+_CLC_OVERLOAD _CLC_DECL double3 sincos(double3 x, global double3 * cosval);
+_CLC_OVERLOAD _CLC_DECL double3 sincos(double3 x, local double3 * cosval);
+_CLC_OVERLOAD _CLC_DECL double3 sincos(double3 x, private double3 * cosval);
+
+_CLC_OVERLOAD _CLC_DECL double4 sincos(double4 x, global double4 * cosval);
+_CLC_OVERLOAD _CLC_DECL double4 sincos(double4 x, local double4 * cosval);
+_CLC_OVERLOAD _CLC_DECL double4 sincos(double4 x, private double4 * cosval);
+
+_CLC_OVERLOAD _CLC_DECL double8 sincos(double8 x, global double8 * cosval);
+_CLC_OVERLOAD _CLC_DECL double8 sincos(double8 x, local double8 * cosval);
+_CLC_OVERLOAD _CLC_DECL double8 sincos(double8 x, private double8 * cosval);
+
+_CLC_OVERLOAD _CLC_DECL double16 sincos(double16 x, global double16 * cosval);
+_CLC_OVERLOAD _CLC_DECL double16 sincos(double16 x, local double16 * cosval);
+_CLC_OVERLOAD _CLC_DECL double16 sincos(double16 x, private double16 * cosval);
+
+/*-----------------------------------------------------------------------------
+* Integer
+*----------------------------------------------------------------------------*/
+#define EXPAND_SIZES(type) \
+ SCALAR(type) \
+ TEMPLATE(_VEC_TYPE(type,2)) \
+ TEMPLATE(_VEC_TYPE(type,3)) \
+ TEMPLATE(_VEC_TYPE(type,4)) \
+ TEMPLATE(_VEC_TYPE(type,8)) \
+ TEMPLATE(_VEC_TYPE(type,16)) \
+
+/* TEMPLATE(gentype): declares hadd and rhadd for a vector type; the
+ * definitions live outside this header section. */
+#define TEMPLATE(gentype) \
+ _CLC_OVERLOAD _CLC_DECL gentype hadd(gentype x1, gentype x2);\
+ _CLC_OVERLOAD _CLC_DECL gentype rhadd(gentype x1, gentype x2);\
+
+/* SCALAR(gentype): inline scalar hadd and rhadd. Each operand is halved
+ * before the addition so the intermediate sum cannot overflow; the final
+ * term restores the low bit: hadd adds 1 only when both low bits are set,
+ * rhadd adds 1 when either low bit is set. */
+#define SCALAR(gentype) \
+ _CLC_OVERLOAD _CLC_INLINE gentype hadd(gentype x, gentype y) \
+ { return (x >> 1) + (y >> 1) + (x & y & 1); } \
+ _CLC_OVERLOAD _CLC_INLINE gentype rhadd(gentype x, gentype y) \
+ { return (x >> 1) + (y >> 1) + ((x&1)|(y&1)); } \
+
+_EXPAND_INTEGER_TYPES()
+
+#undef EXPAND_SIZES
+#undef SCALAR
+#undef TEMPLATE
+
+#define EXPAND_SIZES(type) \
+ SCALAR_IMPLEMENTATION(type) \
+ DECLARATION(_VEC_TYPE(type,2), type) \
+ DECLARATION(_VEC_TYPE(type,3), type) \
+ DECLARATION(_VEC_TYPE(type,4), type) \
+ DECLARATION(_VEC_TYPE(type,8), type) \
+ DECLARATION(_VEC_TYPE(type,16), type) \
+
+/* DECLARATION(gentype, sgentype): declares vector clamp in two forms, one
+ * with vector bounds and one with scalar (sgentype) bounds. */
+#define DECLARATION(gentype, sgentype) \
+_CLC_OVERLOAD _CLC_DECL gentype clamp(gentype x, gentype minval, gentype maxval); \
+_CLC_OVERLOAD _CLC_DECL gentype clamp(gentype x, sgentype minval, sgentype maxval); \
+
+/* SCALAR_IMPLEMENTATION(gentype): inline scalar clamp. The upper bound is
+ * tested first, so maxval is returned whenever x > maxval, even if minval
+ * happens to exceed maxval. */
+#define SCALAR_IMPLEMENTATION(gentype) \
+_CLC_OVERLOAD _CLC_INLINE gentype clamp(gentype x, gentype minval, gentype maxval) \
+ { return x > maxval ? maxval : x < minval ? minval : x; } \
+
+_EXPAND_TYPES()
+
+#undef EXPAND_SIZES
+#undef IMPLEMENTATION
+#undef DECLARATION
+#undef SCALAR_IMPLEMENTATION
+
+#define EXPAND_SIZES(type) \
+ SCALAR_IMPLEMENTATION(type) \
+ IMPLEMENTATION(_VEC_TYPE(type,2), type) \
+ DECLARATION(_VEC_TYPE(type,3), type) \
+ DECLARATION(_VEC_TYPE(type,4), type) \
+ DECLARATION(_VEC_TYPE(type,8), type) \
+ DECLARATION(_VEC_TYPE(type,16), type) \
+
+/* DECLARATION(gentype, sgentype): declares vector min and max, each with a
+ * vector second operand and with a scalar (sgentype) second operand. */
+#define DECLARATION(gentype, sgentype) \
+_CLC_OVERLOAD _CLC_DECL gentype min(gentype x, gentype y); \
+_CLC_OVERLOAD _CLC_DECL gentype min(gentype x, sgentype y); \
+_CLC_OVERLOAD _CLC_DECL gentype max(gentype x, gentype y); \
+_CLC_OVERLOAD _CLC_DECL gentype max(gentype x, sgentype y); \
+
+/* IMPLEMENTATION(gentype, sgentype): inline min and max for a vector type.
+ * The scalar-operand forms cast y to gentype before comparing. In both
+ * functions x is the result when the comparison is false. */
+#define IMPLEMENTATION(gentype, sgentype) \
+_CLC_OVERLOAD _CLC_INLINE gentype min(gentype x, gentype y) \
+ { return y < x ? y : x; } \
+_CLC_OVERLOAD _CLC_INLINE gentype min(gentype x, sgentype y) \
+ { return (gentype)y < x ? (gentype)y : x; } \
+_CLC_OVERLOAD _CLC_INLINE gentype max(gentype x, gentype y) \
+ { return y > x ? y : x; } \
+_CLC_OVERLOAD _CLC_INLINE gentype max(gentype x, sgentype y) \
+ { return (gentype)y > x ? (gentype)y : x; } \
+
+/* SCALAR_IMPLEMENTATION(gentype): inline scalar min and max; x is returned
+ * when the operands compare equal or unordered. */
+#define SCALAR_IMPLEMENTATION(gentype) \
+_CLC_OVERLOAD _CLC_INLINE gentype min(gentype x, gentype y) \
+ { return y < x ? y : x; } \
+_CLC_OVERLOAD _CLC_INLINE gentype max(gentype x, gentype y) \
+ { return y > x ? y : x; } \
+
+_EXPAND_TYPES()
+
+#undef EXPAND_SIZES
+#undef DECLARATION
+#undef IMPLEMENTATION
+#undef SCALAR_IMPLEMENTATION
+
+#define EXPAND_SIZES(type) \
+ SCALAR_IMPLEMENTATION(type) \
+ IMPLEMENTATION(_VEC_TYPE(type,2), type) \
+ DECLARATION(_VEC_TYPE(type,3), type) \
+ DECLARATION(_VEC_TYPE(type,4), type) \
+ DECLARATION(_VEC_TYPE(type,8), type) \
+ DECLARATION(_VEC_TYPE(type,16), type) \
+
+/* DECLARATION(gentype, sgentype): declares vector mix with a vector weight
+ * and with a scalar (sgentype) weight. */
+#define DECLARATION(gentype, sgentype) \
+_CLC_OVERLOAD _CLC_DECL gentype mix(gentype x, gentype y, gentype a); \
+_CLC_OVERLOAD _CLC_DECL gentype mix(gentype x, gentype y, sgentype a); \
+
+/* IMPLEMENTATION(gentype, sgentype): inline vector mix, the linear blend
+ * x + (y - x) * a; the scalar-weight form casts a to gentype first. */
+#define IMPLEMENTATION(gentype, sgentype) \
+_CLC_OVERLOAD _CLC_INLINE gentype mix(gentype x, gentype y, gentype a) \
+ { return x + (y-x) * a; } \
+_CLC_OVERLOAD _CLC_INLINE gentype mix(gentype x, gentype y, sgentype a) \
+ { return x + (y-x) * (gentype)a; } \
+
+/* SCALAR_IMPLEMENTATION(gentype): inline scalar mix, x + (y - x) * a. */
+#define SCALAR_IMPLEMENTATION(gentype) \
+_CLC_OVERLOAD _CLC_INLINE gentype mix(gentype x, gentype y, gentype a) \
+ { return x + (y-x) * a; } \
+
+EXPAND_SIZES(float)
+EXPAND_SIZES(double)
+
+#undef EXPAND_SIZES
+#undef DECLARATION
+#undef IMPLEMENTATION
+#undef SCALAR_IMPLEMENTATION
+
+#define EXPAND_SIZES(type, utype) \
+ TEMPLATE(_VEC_TYPE(type,2), _VEC_TYPE(utype,2)) \
+ TEMPLATE(_VEC_TYPE(type,3), _VEC_TYPE(utype,3)) \
+ TEMPLATE(_VEC_TYPE(type,4), _VEC_TYPE(utype,4)) \
+ TEMPLATE(_VEC_TYPE(type,8), _VEC_TYPE(utype,8)) \
+ TEMPLATE(_VEC_TYPE(type,16), _VEC_TYPE(utype,16)) \
+
+#define TEMPLATE(gentype, ugentype) \
+ _CLC_OVERLOAD _CLC_DECL ugentype abs_diff(gentype x, gentype y);\
+
+EXPAND_SIZES(char, uchar)
+EXPAND_SIZES(uchar, uchar)
+EXPAND_SIZES(short, ushort)
+EXPAND_SIZES(ushort, ushort)
+EXPAND_SIZES(int, uint)
+EXPAND_SIZES(uint, uint)
+EXPAND_SIZES(long, ulong)
+EXPAND_SIZES(ulong, ulong)
+
+/* Scalar abs_diff: subtracts the smaller operand from the larger and
+ * returns the difference in the unsigned type of the same width. */
+_CLC_OVERLOAD _CLC_INLINE uchar abs_diff (char x, char y) { return x>y ? x-y : y-x; }
+_CLC_OVERLOAD _CLC_INLINE uchar abs_diff (uchar x, uchar y) { return x>y ? x-y : y-x; }
+_CLC_OVERLOAD _CLC_INLINE ushort abs_diff (short x, short y) { return x>y ? x-y : y-x; }
+_CLC_OVERLOAD _CLC_INLINE ushort abs_diff (ushort x, ushort y) { return x>y ? x-y : y-x; }
+_CLC_OVERLOAD _CLC_INLINE uint abs_diff (uint x, uint y) { return x>y ? x-y : y-x; }
+_CLC_OVERLOAD _CLC_INLINE ulong abs_diff (ulong x, ulong y) { return x>y ? x-y : y-x; }
+
+/* The signed int and long overloads are only declared here; presumably
+ * they are defined out of line because x - y can overflow in the signed
+ * type -- confirm in the implementation. */
+_CLC_OVERLOAD _CLC_DECL uint abs_diff(int x, int y);
+_CLC_OVERLOAD _CLC_DECL ulong abs_diff(long x, long y);
+
+#undef EXPAND_SIZES
+#undef TEMPLATE
+
+/* mad_hi(a, b, c) is mul_hi(a, b) + c. mul24 and mad24 expand to an
+ * ordinary multiply (plus add) in the operands' own type; no 24-bit
+ * truncation or masking is applied. Every argument is evaluated once. */
+#define mad_hi(a, b, c) (mul_hi((a),(b))+(c))
+#define mul24(a, b) ((a)*(b))
+#define mad24(a, b, c) (((a)*(b))+(c))
+
+/*-----------------------------------------------------------------------------
+* Common
+*----------------------------------------------------------------------------*/
+#define EXPAND_SIZES(type) \
+ IMPLEMENTATION(type) \
+ IMPLEMENTATION(_VEC_TYPE(type,2)) \
+ DECLARATION(_VEC_TYPE(type,3)) \
+ DECLARATION(_VEC_TYPE(type,4)) \
+ DECLARATION(_VEC_TYPE(type,8)) \
+ DECLARATION(_VEC_TYPE(type,16)) \
+
+/* DECLARATION(gentype): declares degrees and radians for a vector type. */
+#define DECLARATION(gentype) \
+_CLC_OVERLOAD _CLC_DECL gentype degrees(gentype radians); \
+_CLC_OVERLOAD _CLC_DECL gentype radians(gentype degrees); \
+
+/* IMPLEMENTATION(gentype): inline conversions. degrees multiplies by 180
+ * and by 1/pi; radians multiplies by pi and divides by 180. The constants
+ * are cast to gentype so the arithmetic is done in the argument's type. */
+#define IMPLEMENTATION(gentype) \
+_CLC_OVERLOAD _CLC_INLINE gentype degrees(gentype radians) { return radians * (gentype)180.0 * (gentype)M_1_PI; } \
+_CLC_OVERLOAD _CLC_INLINE gentype radians(gentype degrees) { return degrees * (gentype)M_PI / (gentype)180.0; }
+
+EXPAND_SIZES(float) /* emit degrees()/radians() for the float and double type families */
+EXPAND_SIZES(double)
+
+#undef EXPAND_SIZES
+#undef DECLARATION
+#undef IMPLEMENTATION
+
+#define EXPAND_SIZES(type) /* step(): inline for scalar and 2-wide, prototypes for 3/4/8/16-wide; vectors also get a scalar-edge overload */ \
+ SCALAR_IMPLEMENTATION(type) \
+ IMPLEMENTATION(_VEC_TYPE(type,2), type) \
+ DECLARATION(_VEC_TYPE(type,3), type) \
+ DECLARATION(_VEC_TYPE(type,4), type) \
+ DECLARATION(_VEC_TYPE(type,8), type) \
+ DECLARATION(_VEC_TYPE(type,16), type) \
+
+#define DECLARATION(gentype, sgentype) /* step() prototypes: edge of type gentype, and edge of scalar type sgentype */ \
+_CLC_OVERLOAD _CLC_DECL gentype step(gentype edge, gentype x); \
+_CLC_OVERLOAD _CLC_DECL gentype step(sgentype edge, gentype x); \
+
+#define IMPLEMENTATION(gentype, sgentype) /* inline step(): 0.0 where x < edge, otherwise 1.0; the sgentype edge is cast to gentype first */ \
+_CLC_OVERLOAD _CLC_INLINE gentype step(gentype edge, gentype x) \
+ { return x < edge ? (gentype)0.0 : (gentype)1.0 ; } \
+_CLC_OVERLOAD _CLC_INLINE gentype step(sgentype edge, gentype x) \
+ { return x < (gentype)edge ? (gentype)0.0 : (gentype)1.0 ; } \
+
+/* Scalar step(): yields 0.0 when x < edge and 1.0 otherwise.
+ * The literals are cast to gentype, matching the vector IMPLEMENTATION macro,
+ * so the float overload's conditional expression has type float instead of
+ * being evaluated as double and narrowed on return. */
+#define SCALAR_IMPLEMENTATION(gentype) \
+_CLC_OVERLOAD _CLC_INLINE gentype step(gentype edge, gentype x) \
+ { return x < edge ? (gentype)0.0 : (gentype)1.0 ; } \
+
+EXPAND_SIZES(float) /* emit step() for the float and double type families */
+EXPAND_SIZES(double)
+
+#undef EXPAND_SIZES
+#undef DECLARATION
+#undef IMPLEMENTATION
+#undef SCALAR_IMPLEMENTATION
+
+_CLC_OVERLOAD _CLC_DECL float smoothstep(float edge0, float edge1, float x); /* float smoothstep prototypes: edges have the same type as x */
+_CLC_OVERLOAD _CLC_DECL float2 smoothstep(float2 edge0, float2 edge1,
+ float2 x);
+_CLC_OVERLOAD _CLC_DECL float3 smoothstep(float3 edge0, float3 edge1,
+ float3 x);
+_CLC_OVERLOAD _CLC_DECL float4 smoothstep(float4 edge0, float4 edge1,
+ float4 x);
+_CLC_OVERLOAD _CLC_DECL float8 smoothstep(float8 edge0, float8 edge1,
+ float8 x);
+_CLC_OVERLOAD _CLC_DECL float16 smoothstep(float16 edge0, float16 edge1,
+ float16 x);
+
+_CLC_OVERLOAD _CLC_DECL float2 smoothstep(float edge0, float edge1, float2 x); /* scalar float edges with a vector x */
+_CLC_OVERLOAD _CLC_DECL float3 smoothstep(float edge0, float edge1, float3 x);
+_CLC_OVERLOAD _CLC_DECL float4 smoothstep(float edge0, float edge1, float4 x);
+_CLC_OVERLOAD _CLC_DECL float8 smoothstep(float edge0, float edge1, float8 x);
+_CLC_OVERLOAD _CLC_DECL float16 smoothstep(float edge0, float edge1, float16 x);
+
+_CLC_OVERLOAD _CLC_DECL double smoothstep(double edge0, double edge1, double x); /* double smoothstep prototypes: edges have the same type as x */
+_CLC_OVERLOAD _CLC_DECL double2 smoothstep(double2 edge0, double2 edge1,
+ double2 x);
+_CLC_OVERLOAD _CLC_DECL double3 smoothstep(double3 edge0, double3 edge1,
+ double3 x);
+_CLC_OVERLOAD _CLC_DECL double4 smoothstep(double4 edge0, double4 edge1,
+ double4 x);
+_CLC_OVERLOAD _CLC_DECL double8 smoothstep(double8 edge0, double8 edge1,
+ double8 x);
+_CLC_OVERLOAD _CLC_DECL double16 smoothstep(double16 edge0, double16 edge1,
+ double16 x);
+
+_CLC_OVERLOAD _CLC_DECL double2 smoothstep(double edge0, double edge1, /* scalar double edges with a vector x */
+ double2 x);
+_CLC_OVERLOAD _CLC_DECL double3 smoothstep(double edge0, double edge1,
+ double3 x);
+_CLC_OVERLOAD _CLC_DECL double4 smoothstep(double edge0, double edge1,
+ double4 x);
+_CLC_OVERLOAD _CLC_DECL double8 smoothstep(double edge0, double edge1,
+ double8 x);
+_CLC_OVERLOAD _CLC_DECL double16 smoothstep(double edge0, double edge1,
+ double16 x);
+
+#define EXPAND_SIZES(type) /* sign(): inline for scalar and 2-wide, prototypes for 3/4/8/16-wide */ \
+ IMPLEMENTATION(type) \
+ IMPLEMENTATION(_VEC_TYPE(type,2)) \
+ DECLARATION(_VEC_TYPE(type,3)) \
+ DECLARATION(_VEC_TYPE(type,4)) \
+ DECLARATION(_VEC_TYPE(type,8)) \
+ DECLARATION(_VEC_TYPE(type,16)) \
+
+#define DECLARATION(gentype) /* prototype for sign() on gentype */ \
+_CLC_OVERLOAD _CLC_DECL gentype sign(gentype x); \
+
+#define IMPLEMENTATION(gentype) /* inline sign(): 1.0 if x > 0, -1.0 if x < 0, 0.0 if isnan(x), otherwise x itself */ \
+_CLC_OVERLOAD _CLC_INLINE gentype sign(gentype x) \
+{ return x > (gentype)0.0 ? (gentype) 1.0 : \
+ x < (gentype)0.0 ? (gentype)-1.0 : \
+ isnan(x) ? (gentype) 0.0 : x; } \
+
+EXPAND_SIZES(float) /* emit sign() for the float and double type families */
+EXPAND_SIZES(double)
+
+#undef EXPAND_SIZES
+#undef DECLARATION
+#undef IMPLEMENTATION
+
+/*-----------------------------------------------------------------------------
+* Geometric
+*----------------------------------------------------------------------------*/
+_CLC_OVERLOAD _CLC_INLINE float dot(float p0, float p1) {return p0*p1;} /* scalar dot is a plain product */
+_CLC_OVERLOAD _CLC_INLINE float dot(float2 p0, float2 p1) {return p0.x*p1.x+p0.y*p1.y;} /* 2-wide: sum of the x and y products */
+_CLC_OVERLOAD _CLC_DECL float dot(float3 p0, float3 p1); /* 3- and 4-wide overloads are prototypes only */
+_CLC_OVERLOAD _CLC_DECL float dot(float4 p0, float4 p1);
+_CLC_OVERLOAD _CLC_INLINE double dot(double p0, double p1) {return p0*p1;} /* double overloads mirror the float ones */
+_CLC_OVERLOAD _CLC_INLINE double dot(double2 p0, double2 p1) {return p0.x*p1.x+p0.y*p1.y;}
+_CLC_OVERLOAD _CLC_DECL double dot(double3 p0, double3 p1) ;
+_CLC_OVERLOAD _CLC_DECL double dot(double4 p0, double4 p1) ;
+
+_CLC_OVERLOAD _CLC_DECL float3 cross(float3 p0, float3 p1); /* cross() is declared for 3- and 4-wide float/double vectors only; no inline bodies here */
+_CLC_OVERLOAD _CLC_DECL float4 cross(float4 p0, float4 p1);
+_CLC_OVERLOAD _CLC_DECL double3 cross(double3 p0, double3 p1);
+_CLC_OVERLOAD _CLC_DECL double4 cross(double4 p0, double4 p1);
+
+/* For a scalar argument, length() and fast_length() are both fabs(p). */
+_CLC_OVERLOAD _CLC_INLINE float length(float p)
+{
+    return fabs(p);
+}
+_CLC_OVERLOAD _CLC_INLINE double length(double p)
+{
+    return fabs(p);
+}
+_CLC_OVERLOAD _CLC_INLINE float fast_length(float p)
+{
+    return fabs(p);
+}
+_CLC_OVERLOAD _CLC_INLINE double fast_length(double p)
+{
+    return fabs(p);
+}
+
+_CLC_OVERLOAD _CLC_DECL float length(float2 p); /* vector length() overloads (2/3/4-wide) are prototypes; the result is a scalar */
+_CLC_OVERLOAD _CLC_DECL float length(float3 p);
+_CLC_OVERLOAD _CLC_DECL float length(float4 p);
+_CLC_OVERLOAD _CLC_DECL double length(double2 p);
+_CLC_OVERLOAD _CLC_DECL double length(double3 p);
+_CLC_OVERLOAD _CLC_DECL double length(double4 p);
+
+_CLC_OVERLOAD _CLC_DECL float fast_length(float2 p); /* vector fast_length() overloads (2/3/4-wide) are prototypes; the result is a scalar */
+_CLC_OVERLOAD _CLC_DECL float fast_length(float3 p);
+_CLC_OVERLOAD _CLC_DECL float fast_length(float4 p);
+_CLC_OVERLOAD _CLC_DECL double fast_length(double2 p);
+_CLC_OVERLOAD _CLC_DECL double fast_length(double3 p);
+_CLC_OVERLOAD _CLC_DECL double fast_length(double4 p);
+
+/* distance(p0, p1): length of the difference p1 - p0.
+ * Scalars use fabs(); vectors defer to length(). */
+_CLC_OVERLOAD _CLC_INLINE float distance(float p0, float p1)
+{
+    float delta = p1 - p0;
+    return fabs(delta);
+}
+_CLC_OVERLOAD _CLC_INLINE float distance(float2 p0, float2 p1)
+{
+    float2 delta = p1 - p0;
+    return length(delta);
+}
+_CLC_OVERLOAD _CLC_INLINE float distance(float3 p0, float3 p1)
+{
+    float3 delta = p1 - p0;
+    return length(delta);
+}
+_CLC_OVERLOAD _CLC_INLINE float distance(float4 p0, float4 p1)
+{
+    float4 delta = p1 - p0;
+    return length(delta);
+}
+_CLC_OVERLOAD _CLC_INLINE double distance(double p0, double p1)
+{
+    double delta = p1 - p0;
+    return fabs(delta);
+}
+_CLC_OVERLOAD _CLC_INLINE double distance(double2 p0, double2 p1)
+{
+    double2 delta = p1 - p0;
+    return length(delta);
+}
+_CLC_OVERLOAD _CLC_INLINE double distance(double3 p0, double3 p1)
+{
+    double3 delta = p1 - p0;
+    return length(delta);
+}
+_CLC_OVERLOAD _CLC_INLINE double distance(double4 p0, double4 p1)
+{
+    double4 delta = p1 - p0;
+    return length(delta);
+}
+
+/* fast_distance(p0, p1): fast_length() of the difference p1 - p0.
+ * Scalars use fabs() directly. */
+_CLC_OVERLOAD _CLC_INLINE float fast_distance(float p0, float p1)
+{
+    float delta = p1 - p0;
+    return fabs(delta);
+}
+_CLC_OVERLOAD _CLC_INLINE float fast_distance(float2 p0, float2 p1)
+{
+    float2 delta = p1 - p0;
+    return fast_length(delta);
+}
+_CLC_OVERLOAD _CLC_INLINE float fast_distance(float3 p0, float3 p1)
+{
+    float3 delta = p1 - p0;
+    return fast_length(delta);
+}
+_CLC_OVERLOAD _CLC_INLINE float fast_distance(float4 p0, float4 p1)
+{
+    float4 delta = p1 - p0;
+    return fast_length(delta);
+}
+_CLC_OVERLOAD _CLC_INLINE double fast_distance(double p0, double p1)
+{
+    double delta = p1 - p0;
+    return fabs(delta);
+}
+_CLC_OVERLOAD _CLC_INLINE double fast_distance(double2 p0, double2 p1)
+{
+    double2 delta = p1 - p0;
+    return fast_length(delta);
+}
+_CLC_OVERLOAD _CLC_INLINE double fast_distance(double3 p0, double3 p1)
+{
+    double3 delta = p1 - p0;
+    return fast_length(delta);
+}
+_CLC_OVERLOAD _CLC_INLINE double fast_distance(double4 p0, double4 p1)
+{
+    double4 delta = p1 - p0;
+    return fast_length(delta);
+}
+
+/* Scalar normalize()/fast_normalize(): +1 for positive p, -1 for negative p.
+ * Any other input (a signed zero or a NaN) is returned unchanged, so a zero
+ * keeps its sign and a NaN propagates instead of collapsing to 0. */
+_CLC_OVERLOAD _CLC_INLINE float normalize(float p)
+{return p > 0.0f ? 1.0f : p < 0.0f ? -1.0f : p;}
+
+_CLC_OVERLOAD _CLC_INLINE double normalize(double p)
+{return p > 0.0 ? 1.0 : p < 0.0 ? -1.0 : p;}
+
+_CLC_OVERLOAD _CLC_INLINE float fast_normalize(float p)
+{return p > 0.0f ? 1.0f : p < 0.0f ? -1.0f : p;}
+
+_CLC_OVERLOAD _CLC_INLINE double fast_normalize(double p)
+{return p > 0.0 ? 1.0 : p < 0.0 ? -1.0 : p;}
+
+/* Vector normalize(): p divided by length(p).
+ * When the length is zero, p is returned unchanged instead of dividing by
+ * zero (which would turn a zero vector into NaNs). A NaN length still
+ * produces NaN components through the division.
+ * NOTE(review): infinite components are not special-cased here. */
+_CLC_OVERLOAD _CLC_INLINE float2 normalize(float2 p)
+{
+    float len = length(p);
+    if (len == 0.0f) return p;
+    return p / len;
+}
+_CLC_OVERLOAD _CLC_INLINE float3 normalize(float3 p)
+{
+    float len = length(p);
+    if (len == 0.0f) return p;
+    return p / len;
+}
+_CLC_OVERLOAD _CLC_INLINE float4 normalize(float4 p)
+{
+    float len = length(p);
+    if (len == 0.0f) return p;
+    return p / len;
+}
+_CLC_OVERLOAD _CLC_INLINE double2 normalize(double2 p)
+{
+    double len = length(p);
+    if (len == 0.0) return p;
+    return p / len;
+}
+_CLC_OVERLOAD _CLC_INLINE double3 normalize(double3 p)
+{
+    double len = length(p);
+    if (len == 0.0) return p;
+    return p / len;
+}
+_CLC_OVERLOAD _CLC_INLINE double4 normalize(double4 p)
+{
+    double len = length(p);
+    if (len == 0.0) return p;
+    return p / len;
+}
+
+/* Vector fast_normalize(): p divided by fast_length(p).
+ * When the length is zero, p is returned unchanged instead of dividing by
+ * zero (which would turn a zero vector into NaNs). A NaN length still
+ * produces NaN components through the division. */
+_CLC_OVERLOAD _CLC_INLINE float2 fast_normalize(float2 p)
+{
+    float len = fast_length(p);
+    if (len == 0.0f) return p;
+    return p / len;
+}
+_CLC_OVERLOAD _CLC_INLINE float3 fast_normalize(float3 p)
+{
+    float len = fast_length(p);
+    if (len == 0.0f) return p;
+    return p / len;
+}
+_CLC_OVERLOAD _CLC_INLINE float4 fast_normalize(float4 p)
+{
+    float len = fast_length(p);
+    if (len == 0.0f) return p;
+    return p / len;
+}
+_CLC_OVERLOAD _CLC_INLINE double2 fast_normalize(double2 p)
+{
+    double len = fast_length(p);
+    if (len == 0.0) return p;
+    return p / len;
+}
+_CLC_OVERLOAD _CLC_INLINE double3 fast_normalize(double3 p)
+{
+    double len = fast_length(p);
+    if (len == 0.0) return p;
+    return p / len;
+}
+_CLC_OVERLOAD _CLC_INLINE double4 fast_normalize(double4 p)
+{
+    double len = fast_length(p);
+    if (len == 0.0) return p;
+    return p / len;
+}
+
+/*-----------------------------------------------------------------------------
+* Atomics
+*----------------------------------------------------------------------------*/
+_CLC_OVERLOAD _CLC_DECL int atomic_add(volatile global int* p, int val); /* atomic_add prototypes: int/uint targets in global and local address spaces */
+_CLC_OVERLOAD _CLC_DECL uint atomic_add(volatile global uint* p, uint val);
+_CLC_OVERLOAD _CLC_DECL int atomic_add(volatile local int* p, int val);
+_CLC_OVERLOAD _CLC_DECL uint atomic_add(volatile local uint* p, uint val);
+
+_CLC_OVERLOAD _CLC_DECL int atomic_sub(volatile global int* p, int val); /* atomic_sub prototypes: same four overloads as atomic_add */
+_CLC_OVERLOAD _CLC_DECL uint atomic_sub(volatile global uint* p, uint val);
+_CLC_OVERLOAD _CLC_DECL int atomic_sub(volatile local int* p, int val);
+_CLC_OVERLOAD _CLC_DECL uint atomic_sub(volatile local uint* p, uint val);
+
+_CLC_OVERLOAD _CLC_DECL int atomic_xchg(volatile global int* p, int val); /* atomic_xchg prototypes: int, uint and float targets in global and local address spaces */
+_CLC_OVERLOAD _CLC_DECL uint atomic_xchg(volatile global uint* p, uint val);
+_CLC_OVERLOAD _CLC_DECL float atomic_xchg(volatile global float* p, float val);
+_CLC_OVERLOAD _CLC_DECL int atomic_xchg(volatile local int* p, int val);
+_CLC_OVERLOAD _CLC_DECL uint atomic_xchg(volatile local uint* p, uint val);
+_CLC_OVERLOAD _CLC_DECL float atomic_xchg(volatile local float* p, float val);
+
+_CLC_OVERLOAD _CLC_DECL int atomic_inc(volatile global int* p); /* atomic_inc prototypes: pointer argument only, int/uint in global and local address spaces */
+_CLC_OVERLOAD _CLC_DECL uint atomic_inc(volatile global uint* p);
+_CLC_OVERLOAD _CLC_DECL int atomic_inc(volatile local int* p);
+_CLC_OVERLOAD _CLC_DECL uint atomic_inc(volatile local uint* p);
+
+_CLC_OVERLOAD _CLC_DECL int atomic_dec(volatile global int* p); /* atomic_dec prototypes: same four overloads as atomic_inc */
+_CLC_OVERLOAD _CLC_DECL uint atomic_dec(volatile global uint* p);
+_CLC_OVERLOAD _CLC_DECL int atomic_dec(volatile local int* p);
+_CLC_OVERLOAD _CLC_DECL uint atomic_dec(volatile local uint* p);
+
+_CLC_OVERLOAD _CLC_DECL int atomic_cmpxchg(volatile global int* p, int cmp, int val); /* atomic_cmpxchg prototypes: take a compare value and a new value; int/uint, global and local */
+_CLC_OVERLOAD _CLC_DECL uint atomic_cmpxchg(volatile global uint* p, uint cmp, uint val);
+_CLC_OVERLOAD _CLC_DECL int atomic_cmpxchg(volatile local int* p, int cmp, int val);
+_CLC_OVERLOAD _CLC_DECL uint atomic_cmpxchg(volatile local uint* p, uint cmp, uint val);
+
+_CLC_OVERLOAD _CLC_DECL int atomic_min(volatile global int* p, int val); /* atomic_min prototypes: int/uint targets in global and local address spaces */
+_CLC_OVERLOAD _CLC_DECL uint atomic_min(volatile global uint* p, uint val);
+_CLC_OVERLOAD _CLC_DECL int atomic_min(volatile local int* p, int val);
+_CLC_OVERLOAD _CLC_DECL uint atomic_min(volatile local uint* p, uint val);
+
+_CLC_OVERLOAD _CLC_DECL int atomic_max(volatile global int* p, int val); /* atomic_max prototypes: same four overloads as atomic_min */
+_CLC_OVERLOAD _CLC_DECL uint atomic_max(volatile global uint* p, uint val);
+_CLC_OVERLOAD _CLC_DECL int atomic_max(volatile local int* p, int val);
+_CLC_OVERLOAD _CLC_DECL uint atomic_max(volatile local uint* p, uint val);
+
+_CLC_OVERLOAD _CLC_DECL int atomic_and(volatile global int* p, int val); /* atomic_and prototypes: int/uint targets in global and local address spaces */
+_CLC_OVERLOAD _CLC_DECL uint atomic_and(volatile global uint* p, uint val);
+_CLC_OVERLOAD _CLC_DECL int atomic_and(volatile local int* p, int val);
+_CLC_OVERLOAD _CLC_DECL uint atomic_and(volatile local uint* p, uint val);
+
+_CLC_OVERLOAD _CLC_DECL int atomic_or(volatile global int* p, int val); /* atomic_or prototypes: same four overloads as atomic_and */
+_CLC_OVERLOAD _CLC_DECL uint atomic_or(volatile global uint* p, uint val);
+_CLC_OVERLOAD _CLC_DECL int atomic_or(volatile local int* p, int val);
+_CLC_OVERLOAD _CLC_DECL uint atomic_or(volatile local uint* p, uint val);
+
+_CLC_OVERLOAD _CLC_DECL int atomic_xor(volatile global int* p, int val); /* atomic_xor prototypes: same four overloads as atomic_and */
+_CLC_OVERLOAD _CLC_DECL uint atomic_xor(volatile global uint* p, uint val);
+_CLC_OVERLOAD _CLC_DECL int atomic_xor(volatile local int* p, int val);
+_CLC_OVERLOAD _CLC_DECL uint atomic_xor(volatile local uint* p, uint val);
+
+#define atom_add atomic_add /* each atom_* name is a plain textual alias for the matching atomic_* function */
+#define atom_sub atomic_sub
+#define atom_xchg atomic_xchg
+#define atom_inc atomic_inc
+#define atom_dec atomic_dec
+#define atom_cmpxchg atomic_cmpxchg
+#define atom_min atomic_min
+#define atom_max atomic_max
+#define atom_and atomic_and
+#define atom_or atomic_or
+#define atom_xor atomic_xor
+
+#define TEMPLATE2(res_elemt, val_vnum, mask_elemt) /* 2-wide result: shuffle()/shuffle2() prototypes for val_vnum-wide sources. NOTE(review): uses _CLC_DEF on prototypes - confirm intended */ \
+_CLC_OVERLOAD _CLC_DEF res_elemt##2 shuffle(res_elemt##val_vnum val, mask_elemt##2 mask);\
+_CLC_OVERLOAD _CLC_DEF res_elemt##2 shuffle2(res_elemt##val_vnum val1, res_elemt##val_vnum val2, mask_elemt##2 mask);
+
+
+#define TEMPLATE4(res_elemt, val_vnum, mask_elemt) /* 4-wide result: shuffle()/shuffle2() prototypes for val_vnum-wide sources. NOTE(review): uses _CLC_DEF on prototypes - confirm intended */ \
+_CLC_OVERLOAD _CLC_DEF res_elemt##4 shuffle(res_elemt##val_vnum val, mask_elemt##4 mask); \
+_CLC_OVERLOAD _CLC_DEF res_elemt##4 shuffle2(res_elemt##val_vnum val1, res_elemt##val_vnum val2, mask_elemt##4 mask);
+
+
+#define TEMPLATE8(res_elemt, val_vnum, mask_elemt) /* 8-wide result: shuffle()/shuffle2() prototypes for val_vnum-wide sources. NOTE(review): uses _CLC_DEF on prototypes - confirm intended */ \
+_CLC_OVERLOAD _CLC_DEF res_elemt##8 shuffle(res_elemt##val_vnum val, mask_elemt##8 mask); \
+_CLC_OVERLOAD _CLC_DEF res_elemt##8 shuffle2(res_elemt##val_vnum val1, res_elemt##val_vnum val2, mask_elemt##8 mask);
+
+
+#define TEMPLATE16(res_elemt, val_vnum, mask_elemt) /* 16-wide result: shuffle()/shuffle2() prototypes for val_vnum-wide sources. NOTE(review): uses _CLC_DEF on prototypes - confirm intended */ \
+_CLC_OVERLOAD _CLC_DEF res_elemt##16 shuffle(res_elemt##val_vnum val, mask_elemt##16 mask); \
+_CLC_OVERLOAD _CLC_DEF res_elemt##16 shuffle2(res_elemt##val_vnum val1, res_elemt##val_vnum val2, mask_elemt##16 mask);
+
+#define CROSS_SIZE(type1, type2) /* every result width (2/4/8/16) against every source width (2/4/8/16) for element type1 and mask element type2 */ \
+TEMPLATE2(type1, 2, type2) \
+TEMPLATE2(type1, 4, type2) \
+TEMPLATE2(type1, 8, type2) \
+TEMPLATE2(type1, 16, type2) \
+TEMPLATE4(type1, 2, type2) \
+TEMPLATE4(type1, 4, type2) \
+TEMPLATE4(type1, 8, type2) \
+TEMPLATE4(type1, 16, type2) \
+TEMPLATE8(type1, 2, type2) \
+TEMPLATE8(type1, 4, type2) \
+TEMPLATE8(type1, 8, type2) \
+TEMPLATE8(type1, 16, type2) \
+TEMPLATE16(type1, 2, type2) \
+TEMPLATE16(type1, 4, type2) \
+TEMPLATE16(type1, 8, type2) \
+TEMPLATE16(type1, 16, type2) \
+
+#define CROSS_MASKTYPE(type) /* repeat CROSS_SIZE for uchar, ushort, uint and ulong mask element types */ \
+CROSS_SIZE(type, uchar) \
+CROSS_SIZE(type, ushort) \
+CROSS_SIZE(type, uint) \
+CROSS_SIZE(type, ulong) \
+
+CROSS_MASKTYPE(char) /* emit the shuffle()/shuffle2() prototypes for each listed element type */
+CROSS_MASKTYPE(uchar)
+CROSS_MASKTYPE(short)
+CROSS_MASKTYPE(ushort)
+CROSS_MASKTYPE(int)
+CROSS_MASKTYPE(uint)
+CROSS_MASKTYPE(long)
+CROSS_MASKTYPE(ulong)
+CROSS_MASKTYPE(float)
+CROSS_MASKTYPE(double)
+
+#undef TEMPLATE2
+#undef TEMPLATE4
+#undef TEMPLATE8
+#undef TEMPLATE16
+#undef CROSS_SIZE
+#undef CROSS_MASKTYPE
+
+#endif //_CLC_H_