Skip to content

Cannot load model after training in Sagemaker #438

@sescapa

Description

@sescapa

I created an MXNet model using SageMaker, and I want to use the model.tar.gz file that is created in the output S3 bucket.

The directory has

  • model_algo_1-0000.params
  • model_algo_1-symbol.json
  • hyperparams.json

I'm trying the following code:

sym, arg_params, aux_params = mx.model.load_checkpoint('model_algo_1', 0)
mod = mx.mod.Module(symbol=sym, context=mx.cpu(), label_names = None)
mod.bind(for_training=False, data_shapes=[('data', (1,3, 480,640))], label_shapes=mod._label_shapes)

However, I keep getting the error:

`

MXNetError Traceback (most recent call last)
~/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/mxnet/symbol/symbol.py in simple_bind(self, ctx, grad_req, type_dict, stype_dict, group2ctx, shared_arg_names, shared_exec, shared_buffer, **kwargs)
1521 shared_exec_handle,
-> 1522 ctypes.byref(exe_handle)))
1523 except MXNetError as e:

~/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/mxnet/base.py in check_call(ret)
251 if ret != 0:
--> 252 raise MXNetError(py_str(_LIB.MXGetLastError()))
253

MXNetError: Error in operator multibox_target: [02:34:37] src/operator/contrib/./multibox_target-inl.h:225: Check failed: lshape.ndim() == 3 (0 vs. 3) Label should be [batch-num_labels-(>=5)] tensor

Stack trace returned 10 entries:
[bt] (0) /home/ec2-user/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x36161a) [0x7fc47fd3a61a]
[bt] (1) /home/ec2-user/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x361c31) [0x7fc47fd3ac31]
[bt] (2) /home/ec2-user/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x4b8421) [0x7fc47fe91421]
[bt] (3) /home/ec2-user/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x2cbd510) [0x7fc482696510]
[bt] (4) /home/ec2-user/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x2aad5ea) [0x7fc4824865ea]
[bt] (5) /home/ec2-user/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x2aaff3d) [0x7fc482488f3d]
[bt] (6) /home/ec2-user/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x2a9c55a) [0x7fc48247555a]
[bt] (7) /home/ec2-user/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x2a9d094) [0x7fc482476094]
[bt] (8) /home/ec2-user/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/mxnet/libmxnet.so(MXExecutorSimpleBind+0x2378) [0x7fc4823cdf28]
[bt] (9) /home/ec2-user/anaconda3/envs/mxnet_p36/lib/python3.6/lib-dynload/../../libffi.so.6(ffi_call_unix64+0x4c) [0x7fc4f3fbcec0]

During handling of the above exception, another exception occurred:

RuntimeError Traceback (most recent call last)
in ()
4 mod = mx.mod.Module(symbol=sym, context=ctx, label_names=None)
5 mod.bind(for_training=False, data_shapes=[('data', (1,3,480,640))],
----> 6 label_shapes=mod._label_shapes)
7 mod.set_params(arg_params, aux_params, allow_missing=True)
8 with open('synset.txt', 'r') as f:

~/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/mxnet/module/module.py in bind(self, data_shapes, label_shapes, for_training, inputs_need_grad, force_rebind, shared_module, grad_req)
427 fixed_param_names=self._fixed_param_names,
428 grad_req=grad_req, group2ctxs=self._group2ctxs,
--> 429 state_names=self._state_names)
430 self._total_exec_bytes = self._exec_group._total_exec_bytes
431 if shared_module is not None:

~/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/mxnet/module/executor_group.py in __init__(self, symbol, contexts, workload, data_shapes, label_shapes, param_names, for_training, inputs_need_grad, shared_group, logger, fixed_param_names, grad_req, state_names, group2ctxs)
277 self.num_outputs = len(self.symbol.list_outputs())
278
--> 279 self.bind_exec(data_shapes, label_shapes, shared_group)
280
281 def decide_slices(self, data_shapes):

~/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/mxnet/module/executor_group.py in bind_exec(self, data_shapes, label_shapes, shared_group, reshape)
373 else:
374 self.execs.append(self._bind_ith_exec(i, data_shapes_i, label_shapes_i,
--> 375 shared_group))
376
377 self.data_shapes = data_shapes

~/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/mxnet/module/executor_group.py in _bind_ith_exec(self, i, data_shapes, label_shapes, shared_group)
660 type_dict=input_types, shared_arg_names=self.param_names,
661 shared_exec=shared_exec, group2ctx=group2ctx,
--> 662 shared_buffer=shared_data_arrays, **input_shapes)
663 self._total_exec_bytes += int(executor.debug_str().split('\n')[-3].split()[1])
664 return executor

~/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/mxnet/symbol/symbol.py in simple_bind(self, ctx, grad_req, type_dict, stype_dict, group2ctx, shared_arg_names, shared_exec, shared_buffer, **kwargs)
1526 error_msg += "%s: %s\n" % (k, v)
1527 error_msg += "%s" % e
-> 1528 raise RuntimeError(error_msg)
1529
1530 # update shared_buffer

RuntimeError: simple_bind error. Arguments:
data: (1, 3, 480, 640)
Error in operator multibox_target: [02:34:37] src/operator/contrib/./multibox_target-inl.h:225: Check failed: lshape.ndim() == 3 (0 vs. 3) Label should be [batch-num_labels-(>=5)] tensor

Stack trace returned 10 entries:
[bt] (0) /home/ec2-user/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x36161a) [0x7fc47fd3a61a]
[bt] (1) /home/ec2-user/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x361c31) [0x7fc47fd3ac31]
[bt] (2) /home/ec2-user/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x4b8421) [0x7fc47fe91421]
[bt] (3) /home/ec2-user/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x2cbd510) [0x7fc482696510]
[bt] (4) /home/ec2-user/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x2aad5ea) [0x7fc4824865ea]
[bt] (5) /home/ec2-user/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x2aaff3d) [0x7fc482488f3d]
[bt] (6) /home/ec2-user/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x2a9c55a) [0x7fc48247555a]
[bt] (7) /home/ec2-user/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x2a9d094) [0x7fc482476094]
[bt] (8) /home/ec2-user/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/mxnet/libmxnet.so(MXExecutorSimpleBind+0x2378) [0x7fc4823cdf28]
[bt] (9) /home/ec2-user/anaconda3/envs/mxnet_p36/lib/python3.6/lib-dynload/../../libffi.so.6(ffi_call_unix64+0x4c) [0x7fc4f3fbcec0]`

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions