
dask to_hdf doesn't apply min_itemsize #941

@user32000

Description


Read in a large, messy CSV

import dask.dataframe
bigcsv = dask.dataframe.read_csv('bigmessy.csv')

Write it out to HDF

bigcsv.to_hdf('/tmp/bigcsv.store', key='bigcsv', min_itemsize=100)


ValueError Traceback (most recent call last)
<ipython-input> in <module>()
----> 1 bigcsv.to_hdf('/tmp/bigcsv.store', key='bigcsv', min_itemsize=100)

/Applications/anaconda/envs/python3/lib/python3.5/site-packages/dask/dataframe/core.py in to_hdf(self, path_or_buf, key, mode, append, complevel, complib, fletcher32, **kwargs)
509 from .io import to_hdf
510 return to_hdf(self, path_or_buf, key, mode, append, complevel, complib,
--> 511 fletcher32, **kwargs)
512
513 @derived_from(pd.DataFrame)

/Applications/anaconda/envs/python3/lib/python3.5/site-packages/dask/dataframe/io.py in to_hdf(df, path_or_buf, key, mode, append, complevel, complib, fletcher32, get, **kwargs)
603
604 DataFrame._get(merge(df.dask, dsk), (name, df.npartitions - 1),
--> 605 get=get_sync, **kwargs)
606
607

/Applications/anaconda/envs/python3/lib/python3.5/site-packages/dask/base.py in _get(cls, dsk, keys, get, **kwargs)
41 get = get or _globals['get'] or cls._default_get
42 dsk2 = cls._optimize(dsk, keys, **kwargs)
---> 43 return get(dsk2, keys, **kwargs)
44
45 @classmethod

/Applications/anaconda/envs/python3/lib/python3.5/site-packages/dask/async.py in get_sync(dsk, keys, **kwargs)
514 queue = Queue()
515 return get_async(apply_sync, 1, dsk, keys, queue=queue,
--> 516 raise_on_exception=True, **kwargs)
517
518

/Applications/anaconda/envs/python3/lib/python3.5/site-packages/dask/async.py in get_async(apply_async, num_workers, dsk, result, cache, queue, get_id, raise_on_exception, rerun_exceptions_locally, callbacks, **kwargs)
485 f(key, res, dsk, state, worker_id)
486 while state['ready'] and len(state['running']) < num_workers:
--> 487 fire_task()
488
489 # Final reporting

/Applications/anaconda/envs/python3/lib/python3.5/site-packages/dask/async.py in fire_task()
456 # Submit
457 apply_async(execute_task, args=[key, dsk[key], data, queue,
--> 458 get_id, raise_on_exception])
459
460 # Seed initial tasks into the thread pool

/Applications/anaconda/envs/python3/lib/python3.5/site-packages/dask/async.py in apply_sync(func, args, kwds)
506 def apply_sync(func, args=(), kwds={}):
507 """ A naive synchronous version of apply_async """
--> 508 return func(*args, **kwds)
509
510

/Applications/anaconda/envs/python3/lib/python3.5/site-packages/dask/async.py in execute_task(key, task, data, queue, get_id, raise_on_exception)
262 """
263 try:
--> 264 result = _execute_task(task, data)
265 id = get_id()
266 result = key, result, None, id

/Applications/anaconda/envs/python3/lib/python3.5/site-packages/dask/async.py in _execute_task(arg, cache, dsk)
243 elif istask(arg):
244 func, args = arg[0], arg[1:]
--> 245 args2 = [_execute_task(a, cache) for a in args]
246 return func(*args2)
247 elif not ishashable(arg):

/Applications/anaconda/envs/python3/lib/python3.5/site-packages/dask/async.py in <listcomp>(.0)
243 elif istask(arg):
244 func, args = arg[0], arg[1:]
--> 245 args2 = [_execute_task(a, cache) for a in args]
246 return func(*args2)
247 elif not ishashable(arg):

/Applications/anaconda/envs/python3/lib/python3.5/site-packages/dask/async.py in _execute_task(arg, cache, dsk)
244 func, args = arg[0], arg[1:]
245 args2 = [_execute_task(a, cache) for a in args]
--> 246 return func(*args2)
247 elif not ishashable(arg):
248 return arg

/Applications/anaconda/envs/python3/lib/python3.5/site-packages/dask/compatibility.py in apply(func, args, kwargs)
24 def apply(func, args, kwargs=None):
25 if kwargs:
---> 26 return func(*args, **kwargs)
27 else:
28 return func(*args)

/Applications/anaconda/envs/python3/lib/python3.5/site-packages/pandas/core/generic.py in to_hdf(self, path_or_buf, key, **kwargs)
937
938 from pandas.io import pytables
--> 939 return pytables.to_hdf(path_or_buf, key, self, **kwargs)
940
941 def to_msgpack(self, path_or_buf=None, **kwargs):

/Applications/anaconda/envs/python3/lib/python3.5/site-packages/pandas/io/pytables.py in to_hdf(path_or_buf, key, value, mode, complevel, complib, append, **kwargs)
255 with HDFStore(path_or_buf, mode=mode, complevel=complevel,
256 complib=complib) as store:
--> 257 f(store)
258 else:
259 f(path_or_buf)

/Applications/anaconda/envs/python3/lib/python3.5/site-packages/pandas/io/pytables.py in <lambda>(store)
248
249 if append:
--> 250 f = lambda store: store.append(key, value, **kwargs)
251 else:
252 f = lambda store: store.put(key, value, **kwargs)

/Applications/anaconda/envs/python3/lib/python3.5/site-packages/pandas/io/pytables.py in append(self, key, value, format, append, columns, dropna, **kwargs)
905 kwargs = self._validate_format(format, kwargs)
906 self._write_to_group(key, value, append=append, dropna=dropna,
--> 907 **kwargs)
908
909 def append_to_multiple(self, d, value, selector, data_columns=None,

/Applications/anaconda/envs/python3/lib/python3.5/site-packages/pandas/io/pytables.py in _write_to_group(self, key, value, format, index, append, complib, encoding, **kwargs)
1250
1251 # write the object
-> 1252 s.write(obj=value, append=append, complib=complib, **kwargs)
1253
1254 if s.is_table and index:

/Applications/anaconda/envs/python3/lib/python3.5/site-packages/pandas/io/pytables.py in write(self, obj, axes, append, complib, complevel, fletcher32, min_itemsize, chunksize, expectedrows, dropna, **kwargs)
3755 self.create_axes(axes=axes, obj=obj, validate=append,
3756 min_itemsize=min_itemsize,
-> 3757 **kwargs)
3758
3759 for a in self.axes:

/Applications/anaconda/envs/python3/lib/python3.5/site-packages/pandas/io/pytables.py in create_axes(self, axes, obj, validate, nan_rep, data_columns, min_itemsize, **kwargs)
3432 self.values_axes.append(col)
3433 except (NotImplementedError, ValueError, TypeError) as e:
-> 3434 raise e
3435 except Exception as detail:
3436 raise Exception(

/Applications/anaconda/envs/python3/lib/python3.5/site-packages/pandas/io/pytables.py in create_axes(self, axes, obj, validate, nan_rep, data_columns, min_itemsize, **kwargs)
3427 encoding=self.encoding,
3428 info=self.info,
-> 3429 **kwargs)
3430 col.set_pos(j)
3431

/Applications/anaconda/envs/python3/lib/python3.5/site-packages/pandas/io/pytables.py in set_atom(self, block, block_items, existing_col, min_itemsize, nan_rep, info, encoding, **kwargs)
1814 min_itemsize,
1815 nan_rep,
-> 1816 encoding)
1817
1818 # set as a data block

/Applications/anaconda/envs/python3/lib/python3.5/site-packages/pandas/io/pytables.py in set_atom_string(self, block, block_items, existing_col, min_itemsize, nan_rep, encoding)
1861 # check for column in the values conflicts
1862 if existing_col is not None:
-> 1863 eci = existing_col.validate_col(itemsize)
1864 if eci > itemsize:
1865 itemsize = eci

/Applications/anaconda/envs/python3/lib/python3.5/site-packages/pandas/io/pytables.py in validate_col(self, itemsize)
1556 "column but\nthis column has a limit of [%s]!\n"
1557 "Consider using min_itemsize to preset the sizes on "
-> 1558 "these columns" % (itemsize, self.cname, c.itemsize))
1559 return c.itemsize
1560

ValueError: Trying to store a string with len [69] in [values_block_2] column but
this column has a limit of [67]!
Consider using min_itemsize to preset the sizes on these columns

... and there is no 'values_block_2' column in the CSV dataset, either.
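For context: 'values_block_2' is not one of the CSV's columns. It is the internal name PyTables gives to a block of same-typed value columns that pandas packs together in the table. The failure is consistent with min_itemsize being dropped on the way down: dask appends the partitions one at a time, the first append created the string block with an itemsize of 67 (the longest string it happened to contain), and a later partition with a 69-character string then no longer fits. Passing data_columns=True makes pandas store each column under its own name, so the error at least points at a real column. A minimal pandas-only sketch of both points (file name, key, and data are made up):

import pandas as pd

# Two frames mimicking two dask partitions appended in sequence; the
# second holds a longer string than the first.
df1 = pd.DataFrame({'a': ['x' * 67]})
df2 = pd.DataFrame({'a': ['y' * 69]})

with pd.HDFStore('/tmp/demo.h5') as store:
    # min_itemsize on the first append sizes the column up front, so the
    # longer string in the second append still fits; data_columns=True
    # stores 'a' under its own name instead of a values_block_N block.
    store.append('demo', df1, min_itemsize=100, data_columns=True)
    store.append('demo', df2)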

dask.__version__
'0.7.6'
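Until min_itemsize is forwarded correctly, a possible workaround is to append each partition with pandas directly, so the keyword reaches HDFStore on the very first write that creates the table. A rough sketch, assuming get_partition() is available in this version and each partition fits in memory (untested):

import dask.dataframe

bigcsv = dask.dataframe.read_csv('bigmessy.csv')

# Append one partition at a time via pandas' own to_hdf so that
# min_itemsize is applied when the table is first created.
for i in range(bigcsv.npartitions):
    part = bigcsv.get_partition(i).compute()
    part.to_hdf('/tmp/bigcsv.store', key='bigcsv', format='table',
                append=True, min_itemsize=100)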
