Skip to content
This repository was archived by the owner on Aug 11, 2020. It is now read-only.

Commit 0b4cedd

Browse files
rahul003 authored and piiswrong committed
Leveraging F16C x86 instruction set architecture extension for half datatype (#330)
* f16c changes * cast double to float for conversion to uint16_t * undo line break removal * whitespace changes * lint fix * detect f16c support on system * remove earlier setting of var * comment * cmake support for linux and mac * cmake support * lint fixes
1 parent b3771de commit 0b4cedd

File tree

4 files changed

+74
-3
lines changed

4 files changed

+74
-3
lines changed

cmake/mshadow.cmake

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,31 @@ else()
4848
add_definitions(-DMSHADOW_USE_SSE=0)
4949
endif()
5050

51+
# Auto-detect whether the F16C half-precision conversion instructions can be
# used. Detection is skipped when the caller has already defined SUPPORT_MF16C
# or when building with MSVC (which has no -mf16c flag).
#
# SUPPORT_MF16C is set to TRUE only when BOTH hold:
#   * the C++ compiler accepts -mf16c          (COMPILER_SUPPORT_MF16C)
#   * the probe of the configuring machine's CPU feature list printed a
#     matching line                            (CPU_SUPPORT_F16C non-empty)
# Otherwise SUPPORT_MF16C is left undefined.
if(NOT DEFINED SUPPORT_MF16C AND NOT MSVC)
  # check_cxx_compiler_flag() lives in this module; include it here so this
  # file does not depend on a parent CMakeLists.txt having loaded it first.
  include(CheckCXXCompilerFlag)
  check_cxx_compiler_flag("-mf16c" COMPILER_SUPPORT_MF16C)

  # The chained COMMANDs form a pipeline; CPU_SUPPORT_F16C receives the output
  # of the last grep and stays empty when the feature is not listed.
  # ERROR_QUIET keeps a missing tool or file from cluttering configure output;
  # in that case the variable is simply empty and F16C stays disabled.
  if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
    execute_process(COMMAND cat /proc/cpuinfo
                    COMMAND grep flags
                    COMMAND grep f16c
                    OUTPUT_VARIABLE CPU_SUPPORT_F16C
                    ERROR_QUIET)
  elseif(CMAKE_SYSTEM_NAME STREQUAL "Darwin")
    execute_process(COMMAND sysctl -a
                    COMMAND grep machdep.cpu.features
                    COMMAND grep F16C
                    OUTPUT_VARIABLE CPU_SUPPORT_F16C
                    ERROR_QUIET)
  endif()

  if(CPU_SUPPORT_F16C AND COMPILER_SUPPORT_MF16C)
    set(SUPPORT_MF16C TRUE)
  endif()
endif()
68+
69+
# Publish the F16C decision to the sources through MSHADOW_USE_F16C.
# When F16C is enabled the compiler additionally needs -mf16c so that the
# conversion intrinsics are available.
if(NOT SUPPORT_MF16C)
  add_definitions(-DMSHADOW_USE_F16C=0)
else()
  add_definitions(-DMSHADOW_USE_F16C=1)
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mf16c")
endif()
75+
5176
if(USE_CUDA)
5277
find_package(CUDA 5.5 QUIET)
5378
find_cuda_helper_libs(curand)

make/mshadow.mk

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,32 @@ else
3131
MSHADOW_CFLAGS += -DMSHADOW_USE_SSE=0
3232
endif
3333

34+
# Auto-detect F16C support unless the user already set USE_F16C.
# On Darwin and Linux the CPU feature list is searched for the F16C flag;
# USE_F16C becomes 1 when a matching line is found and 0 otherwise
# (including on any other non-Windows OS, where no probe is run).
ifndef USE_F16C
    ifneq ($(OS),Windows_NT)
        detected_OS := $(shell uname -s)
        ifeq ($(detected_OS),Darwin)
            F16C_SUPP := $(shell sysctl -a | grep machdep.cpu.features | grep F16C)
        endif
        ifeq ($(detected_OS),Linux)
            F16C_SUPP := $(shell cat /proc/cpuinfo | grep flags | grep f16c)
        endif
        # The shell probe expands to an empty string when grep finds nothing,
        # so "supported" means "non-empty output". Comparing against a
        # sentinel word would always report support.
        ifneq ($(strip $(F16C_SUPP)),)
            USE_F16C=1
        else
            USE_F16C=0
        endif
    endif
    # If the OS is Windows, check whether your processor supports the F16C architecture.
    # One way to do that is to download the tool https://docs.microsoft.com/en-us/sysinternals/downloads/coreinfo.
    # If coreinfo -c shows F16C then you can set USE_F16C=1 explicitly to leverage that capability.
endif
53+
54+
# Translate USE_F16C into compiler flags.
# Enabled:  pass -mf16c so the conversion intrinsics are available, and define
#           MSHADOW_USE_F16C=1 explicitly (matching the CMake build) rather
#           than relying on the header's default value.
# Disabled (or USE_F16C unset): define MSHADOW_USE_F16C=0.
ifeq ($(USE_F16C), 1)
    MSHADOW_CFLAGS += -DMSHADOW_USE_F16C=1 -mf16c
else
    MSHADOW_CFLAGS += -DMSHADOW_USE_F16C=0
endif
59+
3460
ifeq ($(USE_CUDA), 0)
3561
MSHADOW_CFLAGS += -DMSHADOW_USE_CUDA=0
3662
else

mshadow/base.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,12 @@ typedef unsigned __int64 uint64_t;
134134
#ifndef MSHADOW_USE_SSE
135135
#define MSHADOW_USE_SSE 1
136136
#endif
137+
138+
/*!
 * \brief whether to use the F16C instruction set architecture extension
 *
 * A build system may define MSHADOW_USE_F16C to 0 or 1 explicitly; that value
 * always wins. When it is not defined, the default follows the compiler:
 * GCC and Clang predefine __F16C__ when F16C code generation is enabled
 * (e.g. via -mf16c), so F16C is used only when the compiler can actually
 * emit it. Defaulting to 1 unconditionally would break compilation for
 * targets or flag sets without F16C.
 */
#ifndef MSHADOW_USE_F16C
#if defined(__F16C__)
#define MSHADOW_USE_F16C 1
#else
#define MSHADOW_USE_F16C 0
#endif
#endif
142+
137143
/*! \brief whether use NVML to get dynamic info */
138144
#ifndef MSHADOW_USE_NVML
139145
#define MSHADOW_USE_NVML 0

mshadow/half.h

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,10 @@
99
#define MSHADOW_HALF_H_
1010
#include "./base.h"
1111

12+
#if MSHADOW_USE_F16C
13+
#include <x86intrin.h>
14+
#endif // MSHADOW_USE_F16C
15+
1216
#if (MSHADOW_USE_CUDA && CUDA_VERSION >= 7050)
1317
#define MSHADOW_CUDA_HALF 1
1418
#include <cuda_fp16.h>
@@ -61,7 +65,15 @@ namespace half {
6165
return T(__half2float(cuhalf_)); /* NOLINT(*)*/ \
6266
} \
6367
MSHADOW_XINLINE operator T() const volatile { \
64-
return T(__half2float_warp(cuhalf_)); /* NOLINT(*)*/ \
68+
return T(__half2float_warp(cuhalf_)); /* NOLINT(*)*/ \
69+
}
70+
#elif(MSHADOW_USE_F16C)
71+
#define MSHADOW_HALF_CONVERSIONOP(T) \
72+
MSHADOW_XINLINE operator T() const { \
73+
return T(_cvtsh_ss(half_)); /* NOLINT(*)*/ \
74+
} \
75+
MSHADOW_XINLINE operator T() const volatile { \
76+
return T(_cvtsh_ss(half_)); /* NOLINT(*)*/ \
6577
}
6678
#else
6779
#define MSHADOW_HALF_CONVERSIONOP(T) \
@@ -244,9 +256,11 @@ class MSHADOW_ALIGNED(2) half_t {
244256
MSHADOW_XINLINE void constructor(const T& value) {
245257
#if (MSHADOW_CUDA_HALF && defined(__CUDA_ARCH__))
246258
cuhalf_ = __float2half(float(value)); // NOLINT(*)
247-
#else
259+
#elif(MSHADOW_USE_F16C)
260+
half_ = _cvtss_sh(static_cast<float>(value), 0);
261+
#else /* !MSHADOW_CUDA_HALF && !MSHADOW_USE_F16C */
248262
half_ = float2half(float(value)); // NOLINT(*)
249-
#endif // (MSHADOW_CUDA_HALF && defined(__CUDA_ARCH__))
263+
#endif /* !MSHADOW_CUDA_HALF && !MSHADOW_USE_F16C */
250264
}
251265
};
252266

0 commit comments

Comments
 (0)