Skip to content

Avoid/limit heap allocation/seq (shape/strides) in tight loop #89

@mratsim

Description

@mratsim

Naive benchmarking shows that "shape_to_strides", seq assignment, and seq creation generate a constant, non-negligible overhead: 60% of the time is spent in seq management for this benchmark:

bench

(available in "benchmarks" folder)

import ../src/arraymancer_nn, ../src/arraymancer_ag, ../src/arraymancer

let
  # Context from which the network variables below are created
  ctx = newContext(Tensor[float32])

  # Batch size
  bsz = 32

  # Training inputs: bsz * 100 rows (3200 rows, i.e. 100 batches of 32)
  # by 2 columns. Values are drawn as ints in [0, 2) (2 excluded) and
  # then converted to bool.
  x_train_bool = astype(randomTensor([bsz * 100, 2], 2), bool)

# Let's build our truth labels. We need to apply xor between the 2 columns of the tensor
proc xor_alt[T](x, y: T): T =
  ## Named generic wrapper around the builtin `xor` operator.
  ## The builtin cannot be passed to map as is, so this proc forwards
  ## both operands to it and returns the result.
  result = x xor y

# Labels: element-wise xor of column 0 with column 1 of the boolean inputs
let y_bool = x_train_bool[_, 0].map2(xor_alt, x_train_bool[_, 1])


# Cast both inputs and labels to float32 and transpose them so that the
# batch dimension comes last. Only the inputs are wrapped in a variable.
let
  x_train = variable(ctx, transpose(astype(x_train_bool, float32)))
  y = transpose(astype(y_bool, float32))

# Hidden layer: 3 neurons, each fed by the 2 input features.
# Weights are drawn with randomTensor(..., 2.0f) and shifted by -1 so they
# start between -1 and 1.
let layer_3neurons = variable(ctx, randomTensor(3, 2, 2.0f) .- 1.0f)

# Classifier layer: a single output neuron fed by the 3 hidden neurons.
# Same initialisation: random weights between -1 and 1.
let classifier_layer = variable(ctx, randomTensor(1, 3, 2.0f) .- 1.0f)

# Stochastic gradient descent over both layers; 0.01 is the learning rate
let optim = newSGD[float32](layer_3neurons, classifier_layer, 0.01f)

# Training loop: forward pass, backpropagation and weight update per batch.
# NOTE: `0..100` is an inclusive range, so this runs 101 epochs.
for epoch in 0..100:

  for batch_id in 0..<100:

    # Offset of this batch in the tensors (remember, batch size is last).
    # Uses `bsz` rather than a literal so the slicing always matches the
    # batch size the training data was generated with.
    let offset = batch_id * bsz
    let x = x_train[_, offset ..< offset + bsz]
    let target = y[_, offset ..< offset + bsz]

    # Building the network: linear -> relu -> linear -> loss
    let n1 = linear(x, layer_3neurons)
    let n1_act = n1.relu
    let n2 = linear(n1_act, classifier_layer)
    let loss = sigmoid_cross_entropy(n2, target)

    # Compute the gradient (i.e. contribution of each parameter to the loss)
    loss.backprop()

    # Correct the weights now that we have the gradient information
    optim.update()

Trace

Weight	Self Weight		Symbol Name
192.00 ms   60.3%	0 s	 	ex01_xor (23883)
192.00 ms   60.3%	0 s	 	 Main Thread  0x62524
192.00 ms   60.3%	0 s	 	  start
192.00 ms   60.3%	0 s	 	   main
192.00 ms   60.3%	2.00 ms	 	    NimMainModule
57.00 ms   17.9%	1.00 ms	 	     backprop_oVdk9aMLcybHCChVR6aQk5g
51.00 ms   16.0%	0 s	 	      backward_Fqpw21lFPHWNdNORshaa9cA
6.00 ms    1.8%	2.00 ms	 	       map2_RoKBnD6H0IuZSmQLcCMZ1g
4.00 ms    1.2%	0 s	 	        shape_to_strides_vk3EIHePgu5hL4dxsL38tg
4.00 ms    1.2%	2.00 ms	 	         X5BX5D__JoZhL7eQinMhVkOHQyuBhQ
2.00 ms    0.6%	0 s	 	          newSeq
12.00 ms    3.7%	1.00 ms	 	       map_7sNmoBM0FdFmPtxsYkKirA
2.00 ms    0.6%	0 s	 	        at__7KXK9aqsdE0ndrhB7KxewvA
2.00 ms    0.6%	0 s	 	         newSeq
1.00 ms    0.3%	0 s	 	        nimNewSeqOfCap
8.00 ms    2.5%	0 s	 	        shape_to_strides_vk3EIHePgu5hL4dxsL38tg
3.00 ms    0.9%	0 s	 	         X5BX5D__JoZhL7eQinMhVkOHQyuBhQ
3.00 ms    0.9%	0 s	 	          newSeq
4.00 ms    1.2%	1.00 ms	 	         amp__YMHcPoBMZP9bnnIcR8Iy9cUQ
3.00 ms    0.9%	2.00 ms	 	          newSeq
1.00 ms    0.3%	0 s	 	         genericSeqAssign
8.00 ms    2.5%	0 s	 	       reversed_S4WoGleqxGOb3jjUvFKyfA
8.00 ms    2.5%	0 s	 	        newSeq
24.00 ms    7.5%	2.00 ms	 	       star__xQtjCZX3EuzWXnc0t9bCM2w
1.00 ms    0.3%	0 s	 	        newSeq
4.00 ms    1.2%	0 s	 	        nimNewSeqOfCap
1.00 ms    0.3%	1.00 ms	 	        setLengthSeq
16.00 ms    5.0%	2.00 ms	 	        unsafeContiguous_Nck5nnO9bAJVgi7JMAI8knA
14.00 ms    4.4%	0 s	 	         genericSeqAssign
1.00 ms    0.3%	0 s	 	       unsafeBroadcast_9aZErpPXuidMn9cbDgghwtrg
1.00 ms    0.3%	0 s	 	        genericSeqAssign
1.00 ms    0.3%	0 s	 	      genericSeqAssign
1.00 ms    0.3%	0 s	 	      newSeq
3.00 ms    0.9%	0 s	 	      shape_to_strides_vk3EIHePgu5hL4dxsL38tg
2.00 ms    0.6%	0 s	 	       amp__YMHcPoBMZP9bnnIcR8Iy9cUQ
2.00 ms    0.6%	0 s	 	        newSeq
1.00 ms    0.3%	0 s	 	       genericSeqAssign
1.00 ms    0.3%	1.00 ms	 	     incrSeqV2
21.00 ms    6.6%	1.00 ms	 	     linear_9aa0crPUgSzc3WGSOrKwanw
20.00 ms    6.2%	0 s	 	      forward_pdrb9bebPpNDv5TauQs8LOgex01_xor
1.00 ms    0.3%	0 s	 	       newSeq
1.00 ms    0.3%	1.00 ms	 	       newSeq_mi9afQ1klNXRFnVSLwJV9aVg
5.00 ms    1.5%	0 s	 	       shape_to_strides_vk3EIHePgu5hL4dxsL38tg
3.00 ms    0.9%	0 s	 	        X5BX5D__JoZhL7eQinMhVkOHQyuBhQ
3.00 ms    0.9%	0 s	 	         newSeq
1.00 ms    0.3%	0 s	 	        amp__YMHcPoBMZP9bnnIcR8Iy9cUQ
1.00 ms    0.3%	0 s	 	         newSeq
1.00 ms    0.3%	0 s	 	        genericSeqAssign
13.00 ms    4.0%	0 s	 	       star__xQtjCZX3EuzWXnc0t9bCM2w
1.00 ms    0.3%	0 s	 	        newSeq
12.00 ms    3.7%	1.00 ms	 	        unsafeContiguous_Nck5nnO9bAJVgi7JMAI8knA
11.00 ms    3.4%	1.00 ms	 	         genericSeqAssign
1.00 ms    0.3%	0 s	 	     randomTensor_0CLBTaXQo1slbLknroeFow
1.00 ms    0.3%	0 s	 	      shape_to_strides_vk3EIHePgu5hL4dxsL38tg
1.00 ms    0.3%	0 s	 	       genericSeqAssign
11.00 ms    3.4%	0 s	 	     relu_SZbqcSLLEQfQBKnhXXEa0w
7.00 ms    2.2%	0 s	 	      forward_afTd72d9apMokICtszjdsPAex01_xor
1.00 ms    0.3%	0 s	 	       at__7KXK9aqsdE0ndrhB7KxewvA
1.00 ms    0.3%	0 s	 	        newSeq
4.00 ms    1.2%	2.00 ms	 	       map_7sNmoBM0FdFmPtxsYkKirA
1.00 ms    0.3%	0 s	 	        nimNewSeqOfCap
1.00 ms    0.3%	0 s	 	        shape_to_strides_vk3EIHePgu5hL4dxsL38tg
1.00 ms    0.3%	0 s	 	         X5BX5D__JoZhL7eQinMhVkOHQyuBhQ
1.00 ms    0.3%	1.00 ms	 	          newSeq
2.00 ms    0.6%	1.00 ms	 	       shape_to_strides_vk3EIHePgu5hL4dxsL38tg
1.00 ms    0.3%	0 s	 	        genericSeqAssign
4.00 ms    1.2%	0 s	 	      genericSeqAssign
50.00 ms   15.7%	0 s	 	     sigmoid_cross_entropy_Yau9cGp7xu7MB2nxk5jZa9cQ
29.00 ms    9.1%	2.00 ms	 	      forward_eiS5bzXq9cybpN9bAe3jgt5Aex01_xor
2.00 ms    0.6%	0 s	 	       shape_to_strides_vk3EIHePgu5hL4dxsL38tg
1.00 ms    0.3%	0 s	 	        amp__YMHcPoBMZP9bnnIcR8Iy9cUQ
1.00 ms    0.3%	0 s	 	         newSeq
1.00 ms    0.3%	0 s	 	        newSeq
25.00 ms    7.8%	0 s	 	       toTensor_PDoWBw7dhWertuPrbd3nqQ
8.00 ms    2.5%	0 s	 	        amp__YMHcPoBMZP9bnnIcR8Iy9cUQ
8.00 ms    2.5%	1.00 ms	 	         newSeq
3.00 ms    0.9%	0 s	 	        genericSeqAssign
1.00 ms    0.3%	0 s	 	        incrSeqV2
1.00 ms    0.3%	1.00 ms	 	         growObj_FZeyQYjWPcE9c06y1gNqZxw
4.00 ms    1.2%	2.00 ms	 	        newSeq
9.00 ms    2.8%	0 s	 	        shape_to_strides_vk3EIHePgu5hL4dxsL38tg
1.00 ms    0.3%	0 s	 	         X5BX5D__JoZhL7eQinMhVkOHQyuBhQ
1.00 ms    0.3%	1.00 ms	 	          newSeq
8.00 ms    2.5%	0 s	 	         amp__YMHcPoBMZP9bnnIcR8Iy9cUQ
8.00 ms    2.5%	0 s	 	          newSeq
21.00 ms    6.6%	0 s	 	      genericSeqAssign
26.00 ms    8.1%	2.00 ms	 	     slicer_BD1F1oU9a9cLZM9aHXZ2JbVKw_2
24.00 ms    7.5%	0 s	 	      genericSeqAssign
2.00 ms    0.6%	1.00 ms	 	     unsafeSlicer_BD1F1oU9a9cLZM9aHXZ2JbVKw
1.00 ms    0.3%	0 s	 	      genericSeqAssign
21.00 ms    6.6%	1.00 ms	 	     update_4t6MKNnjrt9b9cUtnIk3Iizg
16.00 ms    5.0%	1.00 ms	 	      map_7sNmoBM0FdFmPtxsYkKirA
1.00 ms    0.3%	0 s	 	       at__7KXK9aqsdE0ndrhB7KxewvA
1.00 ms    0.3%	0 s	 	        newSeq
14.00 ms    4.4%	0 s	 	       shape_to_strides_vk3EIHePgu5hL4dxsL38tg
9.00 ms    2.8%	0 s	 	        X5BX5D__JoZhL7eQinMhVkOHQyuBhQ
9.00 ms    2.8%	0 s	 	         newSeq
1.00 ms    0.3%	0 s	 	        amp__YMHcPoBMZP9bnnIcR8Iy9cUQ
1.00 ms    0.3%	1.00 ms	 	         newSeq
2.00 ms    0.6%	0 s	 	        genericSeqAssign
2.00 ms    0.6%	0 s	 	        newSeq
4.00 ms    1.2%	0 s	 	      shape_to_strides_vk3EIHePgu5hL4dxsL38tg
3.00 ms    0.9%	2.00 ms	 	       X5BX5D__JoZhL7eQinMhVkOHQyuBhQ
1.00 ms    0.3%	0 s	 	        newSeq
1.00 ms    0.3%	0 s	 	       genericSeqAssign

Metadata

Metadata

Assignees

No one assigned

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions