-
-
Notifications
You must be signed in to change notification settings - Fork 100
Avoid/limit heap allocation/seq (shape/strides) in tight loop #89
Copy link
Copy link
Closed
Labels
Description
Naive benchmarking shows that "shape_to_strides", seq assignment and seq creation generate a constant, non-negligible overhead - 60% of the time is spent in seq management for this benchmark:
bench
(available in "benchmarks" folder)
import ../src/arraymancer_nn, ../src/arraymancer_ag, ../src/arraymancer
# Autograd context over float32 tensors, plus the mini-batch size
let
  ctx = newContext(Tensor[float32])
  bsz = 32 # batch size

# The input tensor has bsz * 100 rows (3200 --> 100 batches of 32) and 2 columns.
# Values are created as int in [0, 2[ (2 excluded) and then converted to bool.
let x_train_bool = randomTensor([bsz * 100, 2], 2).astype(bool)

# Next, build the truth labels: apply xor between the 2 columns of the tensor
proc xor_alt[T](x,y: T): T =
  ## Returns `x xor y`.
  ##
  ## xor is builtin and cannot be passed to map as is, so this thin
  ## generic wrapper exists to be handed to `map2` instead.
  x xor y
# Truth labels: xor_alt applied element-wise to column 0 and column 1
let y_bool = map2(x_train_bool[_,0], xor_alt, x_train_bool[_,1])

# Convert to float and transpose so batch_size is last
let x_train = ctx.variable(x_train_bool.astype(float32).transpose)
let y = y_bool.astype(float32).transpose

# First hidden layer of 3 neurons, with 2 features in
# We initialize with random weights between -1 and 1
let layer_3neurons = ctx.variable(
  randomTensor(3, 2, 2.0f) .- 1.0f
)

# Classifier layer with 1 neuron per feature. (In our case only one neuron overall)
# We initialize with random weights between -1 and 1
let classifier_layer = ctx.variable(
  randomTensor(1, 3, 2.0f) .- 1.0f
)

# Stochastic Gradient Descent
let optim = newSGD[float32](
  layer_3neurons, classifier_layer, 0.01f # 0.01 is the learning rate
)
# Training loop: every epoch walks over the 100 mini-batches.
# Note: `0..100` is an inclusive range, so this runs 101 epochs.
for epoch in 0..100:
  for batch_id in 0..<100:
    # offset in the Tensor (Remember, batch size is last)
    let offset = batch_id * bsz
    let x = x_train[_, offset ..< offset + bsz]
    let target = y[_, offset ..< offset + bsz]

    # Building the network
    let n1 = linear(x, layer_3neurons)
    let n1_act = n1.relu
    let n2 = linear(n1_act, classifier_layer)
    let loss = sigmoid_cross_entropy(n2, target)

    # Compute the gradient (i.e. contribution of each parameter to the loss)
    loss.backprop()

    # Correct the weights now that we have the gradient information
    optim.update()
Trace
Weight Self Weight Symbol Name
192.00 ms 60.3% 0 s ex01_xor (23883)
192.00 ms 60.3% 0 s Main Thread 0x62524
192.00 ms 60.3% 0 s start
192.00 ms 60.3% 0 s main
192.00 ms 60.3% 2.00 ms NimMainModule
57.00 ms 17.9% 1.00 ms backprop_oVdk9aMLcybHCChVR6aQk5g
51.00 ms 16.0% 0 s backward_Fqpw21lFPHWNdNORshaa9cA
6.00 ms 1.8% 2.00 ms map2_RoKBnD6H0IuZSmQLcCMZ1g
4.00 ms 1.2% 0 s shape_to_strides_vk3EIHePgu5hL4dxsL38tg
4.00 ms 1.2% 2.00 ms X5BX5D__JoZhL7eQinMhVkOHQyuBhQ
2.00 ms 0.6% 0 s newSeq
12.00 ms 3.7% 1.00 ms map_7sNmoBM0FdFmPtxsYkKirA
2.00 ms 0.6% 0 s at__7KXK9aqsdE0ndrhB7KxewvA
2.00 ms 0.6% 0 s newSeq
1.00 ms 0.3% 0 s nimNewSeqOfCap
8.00 ms 2.5% 0 s shape_to_strides_vk3EIHePgu5hL4dxsL38tg
3.00 ms 0.9% 0 s X5BX5D__JoZhL7eQinMhVkOHQyuBhQ
3.00 ms 0.9% 0 s newSeq
4.00 ms 1.2% 1.00 ms amp__YMHcPoBMZP9bnnIcR8Iy9cUQ
3.00 ms 0.9% 2.00 ms newSeq
1.00 ms 0.3% 0 s genericSeqAssign
8.00 ms 2.5% 0 s reversed_S4WoGleqxGOb3jjUvFKyfA
8.00 ms 2.5% 0 s newSeq
24.00 ms 7.5% 2.00 ms star__xQtjCZX3EuzWXnc0t9bCM2w
1.00 ms 0.3% 0 s newSeq
4.00 ms 1.2% 0 s nimNewSeqOfCap
1.00 ms 0.3% 1.00 ms setLengthSeq
16.00 ms 5.0% 2.00 ms unsafeContiguous_Nck5nnO9bAJVgi7JMAI8knA
14.00 ms 4.4% 0 s genericSeqAssign
1.00 ms 0.3% 0 s unsafeBroadcast_9aZErpPXuidMn9cbDgghwtrg
1.00 ms 0.3% 0 s genericSeqAssign
1.00 ms 0.3% 0 s genericSeqAssign
1.00 ms 0.3% 0 s newSeq
3.00 ms 0.9% 0 s shape_to_strides_vk3EIHePgu5hL4dxsL38tg
2.00 ms 0.6% 0 s amp__YMHcPoBMZP9bnnIcR8Iy9cUQ
2.00 ms 0.6% 0 s newSeq
1.00 ms 0.3% 0 s genericSeqAssign
1.00 ms 0.3% 1.00 ms incrSeqV2
21.00 ms 6.6% 1.00 ms linear_9aa0crPUgSzc3WGSOrKwanw
20.00 ms 6.2% 0 s forward_pdrb9bebPpNDv5TauQs8LOgex01_xor
1.00 ms 0.3% 0 s newSeq
1.00 ms 0.3% 1.00 ms newSeq_mi9afQ1klNXRFnVSLwJV9aVg
5.00 ms 1.5% 0 s shape_to_strides_vk3EIHePgu5hL4dxsL38tg
3.00 ms 0.9% 0 s X5BX5D__JoZhL7eQinMhVkOHQyuBhQ
3.00 ms 0.9% 0 s newSeq
1.00 ms 0.3% 0 s amp__YMHcPoBMZP9bnnIcR8Iy9cUQ
1.00 ms 0.3% 0 s newSeq
1.00 ms 0.3% 0 s genericSeqAssign
13.00 ms 4.0% 0 s star__xQtjCZX3EuzWXnc0t9bCM2w
1.00 ms 0.3% 0 s newSeq
12.00 ms 3.7% 1.00 ms unsafeContiguous_Nck5nnO9bAJVgi7JMAI8knA
11.00 ms 3.4% 1.00 ms genericSeqAssign
1.00 ms 0.3% 0 s randomTensor_0CLBTaXQo1slbLknroeFow
1.00 ms 0.3% 0 s shape_to_strides_vk3EIHePgu5hL4dxsL38tg
1.00 ms 0.3% 0 s genericSeqAssign
11.00 ms 3.4% 0 s relu_SZbqcSLLEQfQBKnhXXEa0w
7.00 ms 2.2% 0 s forward_afTd72d9apMokICtszjdsPAex01_xor
1.00 ms 0.3% 0 s at__7KXK9aqsdE0ndrhB7KxewvA
1.00 ms 0.3% 0 s newSeq
4.00 ms 1.2% 2.00 ms map_7sNmoBM0FdFmPtxsYkKirA
1.00 ms 0.3% 0 s nimNewSeqOfCap
1.00 ms 0.3% 0 s shape_to_strides_vk3EIHePgu5hL4dxsL38tg
1.00 ms 0.3% 0 s X5BX5D__JoZhL7eQinMhVkOHQyuBhQ
1.00 ms 0.3% 1.00 ms newSeq
2.00 ms 0.6% 1.00 ms shape_to_strides_vk3EIHePgu5hL4dxsL38tg
1.00 ms 0.3% 0 s genericSeqAssign
4.00 ms 1.2% 0 s genericSeqAssign
50.00 ms 15.7% 0 s sigmoid_cross_entropy_Yau9cGp7xu7MB2nxk5jZa9cQ
29.00 ms 9.1% 2.00 ms forward_eiS5bzXq9cybpN9bAe3jgt5Aex01_xor
2.00 ms 0.6% 0 s shape_to_strides_vk3EIHePgu5hL4dxsL38tg
1.00 ms 0.3% 0 s amp__YMHcPoBMZP9bnnIcR8Iy9cUQ
1.00 ms 0.3% 0 s newSeq
1.00 ms 0.3% 0 s newSeq
25.00 ms 7.8% 0 s toTensor_PDoWBw7dhWertuPrbd3nqQ
8.00 ms 2.5% 0 s amp__YMHcPoBMZP9bnnIcR8Iy9cUQ
8.00 ms 2.5% 1.00 ms newSeq
3.00 ms 0.9% 0 s genericSeqAssign
1.00 ms 0.3% 0 s incrSeqV2
1.00 ms 0.3% 1.00 ms growObj_FZeyQYjWPcE9c06y1gNqZxw
4.00 ms 1.2% 2.00 ms newSeq
9.00 ms 2.8% 0 s shape_to_strides_vk3EIHePgu5hL4dxsL38tg
1.00 ms 0.3% 0 s X5BX5D__JoZhL7eQinMhVkOHQyuBhQ
1.00 ms 0.3% 1.00 ms newSeq
8.00 ms 2.5% 0 s amp__YMHcPoBMZP9bnnIcR8Iy9cUQ
8.00 ms 2.5% 0 s newSeq
21.00 ms 6.6% 0 s genericSeqAssign
26.00 ms 8.1% 2.00 ms slicer_BD1F1oU9a9cLZM9aHXZ2JbVKw_2
24.00 ms 7.5% 0 s genericSeqAssign
2.00 ms 0.6% 1.00 ms unsafeSlicer_BD1F1oU9a9cLZM9aHXZ2JbVKw
1.00 ms 0.3% 0 s genericSeqAssign
21.00 ms 6.6% 1.00 ms update_4t6MKNnjrt9b9cUtnIk3Iizg
16.00 ms 5.0% 1.00 ms map_7sNmoBM0FdFmPtxsYkKirA
1.00 ms 0.3% 0 s at__7KXK9aqsdE0ndrhB7KxewvA
1.00 ms 0.3% 0 s newSeq
14.00 ms 4.4% 0 s shape_to_strides_vk3EIHePgu5hL4dxsL38tg
9.00 ms 2.8% 0 s X5BX5D__JoZhL7eQinMhVkOHQyuBhQ
9.00 ms 2.8% 0 s newSeq
1.00 ms 0.3% 0 s amp__YMHcPoBMZP9bnnIcR8Iy9cUQ
1.00 ms 0.3% 1.00 ms newSeq
2.00 ms 0.6% 0 s genericSeqAssign
2.00 ms 0.6% 0 s newSeq
4.00 ms 1.2% 0 s shape_to_strides_vk3EIHePgu5hL4dxsL38tg
3.00 ms 0.9% 2.00 ms X5BX5D__JoZhL7eQinMhVkOHQyuBhQ
1.00 ms 0.3% 0 s newSeq
1.00 ms 0.3% 0 s genericSeqAssign
Reactions are currently unavailable