@@ -714,7 +714,6 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,
714714 dst[row] = tmp[0 ];
715715 }
716716}
717-
718717);
719718
720719
@@ -784,6 +783,7 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float
784783 dst[row] = tmp[0 ];
785784 }
786785}
786+
787787);
788788
789789
@@ -799,6 +799,18 @@ __kernel void KERNEL_NAME(__global TYPE* x, const int x_offset, __global TYPE* y
799799}
800800);
801801
802+ std::string add_template = MULTILINE_QUOTE(
803+ __kernel void add_f32 (__global float * x, const int x_offset, __global float * y, const int y_offset, __global float * dst, const int dst_offset, const int ky) {
804+ const int i = get_group_id (0 )*get_local_size (0 ) + get_local_id (0 );
805+
806+ if (i >= get_global_size (0 )) {
807+ return ;
808+ }
809+
810+ dst[dst_offset + i] = x[x_offset + i] + y[y_offset + i%ky];
811+ }
812+ );
813+
802814#define CL_CHECK (err ) \
803815 do { \
804816 cl_int err_ = (err); \
@@ -878,6 +890,7 @@ static std::string generate_kernels() {
878890 }
879891 src << mul_kernel << ' \n ' ;
880892 }
893+ src << add_template << ' \n ' ;
881894
882895 return src.str ();
883896}
@@ -893,6 +906,7 @@ static cl_kernel dequantize_mul_mat_vec_q4_0_cl, dequantize_mul_mat_vec_q4_1_cl,
893906static cl_kernel dequantize_block_q2_k_cl, dequantize_block_q3_k_cl, dequantize_block_q4_k_cl, dequantize_block_q5_k_cl, dequantize_block_q6_k_cl;
894907static cl_kernel dequantize_mul_mat_vec_q2_K_cl, dequantize_mul_mat_vec_q3_K_cl, dequantize_mul_mat_vec_q4_K_cl, dequantize_mul_mat_vec_q5_K_cl, dequantize_mul_mat_vec_q6_K_cl;
895908static cl_kernel mul_f32_cl;
909+ static cl_kernel add_f32_cl;
896910static bool fp16_support;
897911
898912static cl_program build_program_from_source (cl_context ctx, cl_device_id dev, const char * program_buffer) {
@@ -1100,9 +1114,10 @@ void ggml_cl_init(void) {
11001114 char *ext_buffer = (char *)alloca (ext_str_size + 1 );
11011115 clGetDeviceInfo (device, CL_DEVICE_EXTENSIONS, ext_str_size, ext_buffer, NULL );
11021116 ext_buffer[ext_str_size] = ' \0 ' ; // ensure it is null terminated
1117+ // Disabled due to faulty outputs
11031118 // Check if ext_buffer contains cl_khr_fp16
1104- fp16_support = strstr (ext_buffer, " cl_khr_fp16" ) != NULL ;
1105- fprintf (stderr, " ggml_opencl: device FP16 support: %s\n " , fp16_support ? " true" : " false" );
1119+ fp16_support = false ; // strstr(ext_buffer, "cl_khr_fp16") != NULL;
1120+ // fprintf(stderr, "ggml_opencl: device FP16 support: %s\n", fp16_support ? "true" : "false");
11061121
11071122 cl_context_properties properties[] = {
11081123 (intptr_t )CL_CONTEXT_PLATFORM, (intptr_t )platform, 0
@@ -1150,6 +1165,8 @@ void ggml_cl_init(void) {
11501165
11511166 // mul kernel
11521167 CL_CHECK ((mul_f32_cl = clCreateKernel (program, " mul_f32" , &err), err));
1168+
1169+ CL_CHECK ((add_f32_cl = clCreateKernel (program, " add_f32" , &err), err));
11531170}
11541171
11551172static cl_kernel* ggml_get_to_fp32_cl (ggml_type type) {
@@ -1458,6 +1475,70 @@ void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src
14581475 ggml_cl_mul_f32 (src0, src1, dst);
14591476}
14601477
1478+ static void ggml_cl_add_f32 (const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
1479+ GGML_ASSERT (src1->backend == GGML_BACKEND_GPU);
1480+ const int64_t ne00 = src0->ne [0 ];
1481+ const int64_t ne01 = src0->ne [1 ];
1482+ const int64_t ne02 = src0->ne [2 ];
1483+ const int64_t ne03 = src0->ne [3 ];
1484+ const int64_t ne10 = src1->ne [0 ];
1485+ const int64_t ne11 = src1->ne [1 ];
1486+ const int64_t ne12 = src1->ne [2 ];
1487+ const int64_t ne13 = src1->ne [3 ];
1488+ const int nb2 = dst->nb [2 ];
1489+ const int nb3 = dst->nb [3 ];
1490+ size_t x_size;
1491+ size_t d_size;
1492+
1493+ cl_mem d_X = ggml_cl_pool_malloc (ne00 * ne01 * sizeof (float ), &x_size); // src0
1494+ cl_mem d_Y = (cl_mem) src1->extra ; // src1 is already on device, broadcasted.
1495+ cl_mem d_D = ggml_cl_pool_malloc (ne00 * ne01 * sizeof (float ), &d_size); // dst
1496+
1497+
1498+ for (int64_t i03 = 0 ; i03 < ne03; i03++) {
1499+ for (int64_t i02 = 0 ; i02 < ne02; i02++) {
1500+ cl_event ev;
1501+
1502+ // copy src0 to device
1503+ CL_CHECK (ggml_cl_h2d_tensor_2d (queue, d_X, 0 , src0, i03, i02, &ev));
1504+
1505+ const int64_t i13 = i03%ne13;
1506+ const int64_t i12 = i02%ne12;
1507+ const int i1 = i13*ne12*ne11 + i12*ne11;
1508+
1509+ cl_int x_offset = 0 ;
1510+ cl_int y_offset = i1*ne10;
1511+ cl_int d_offset = 0 ;
1512+
1513+ size_t global = ne00 * ne01;
1514+ cl_int ky = ne10 * ne11;
1515+
1516+ CL_CHECK (clSetKernelArg (add_f32_cl, 0 , sizeof (cl_mem), &d_X));
1517+ CL_CHECK (clSetKernelArg (add_f32_cl, 1 , sizeof (cl_int), &x_offset));
1518+ CL_CHECK (clSetKernelArg (add_f32_cl, 2 , sizeof (cl_mem), &d_Y));
1519+ CL_CHECK (clSetKernelArg (add_f32_cl, 3 , sizeof (cl_int), &y_offset));
1520+ CL_CHECK (clSetKernelArg (add_f32_cl, 4 , sizeof (cl_mem), &d_D));
1521+ CL_CHECK (clSetKernelArg (add_f32_cl, 5 , sizeof (cl_int), &d_offset));
1522+ CL_CHECK (clSetKernelArg (add_f32_cl, 6 , sizeof (cl_int), &ky));
1523+ CL_CHECK (clEnqueueNDRangeKernel (queue, add_f32_cl, 1 , NULL , &global, NULL , 1 , &ev, NULL ));
1524+
1525+ CL_CHECK (clReleaseEvent (ev));
1526+ CL_CHECK (clFinish (queue));
1527+
1528+ // copy dst to host
1529+ float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
1530+ CL_CHECK (clEnqueueReadBuffer (queue, d_D, true , 0 , sizeof (float ) * ne00*ne01, d, 0 , NULL , NULL ));
1531+ }
1532+ }
1533+ ggml_cl_pool_free (d_X, x_size);
1534+ ggml_cl_pool_free (d_D, d_size);
1535+ }
1536+
1537+ void ggml_cl_add (const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
1538+ GGML_ASSERT (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
1539+ ggml_cl_add_f32 (src0, src1, dst);
1540+ }
1541+
14611542static void ggml_cl_mul_mat_f32 (const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
14621543 const int64_t ne00 = src0->ne [0 ];
14631544 const int64_t ne01 = src0->ne [1 ];
0 commit comments