
dask to_hdf doesn't apply min_itemsize #941

@user32000

Description


Read in a large, messy CSV

import dask.dataframe
bigcsv = dask.dataframe.read_csv('bigmessy.csv')

Write it out to HDF

bigcsv.to_hdf('/tmp/bigcsv.store', key='bigcsv', min_itemsize=100)


ValueError Traceback (most recent call last)
<ipython-input> in <module>()
----> 1 bigcsv.to_hdf('/tmp/bigcsv.store', key='bigcsv', min_itemsize=100)

/Applications/anaconda/envs/python3/lib/python3.5/site-packages/dask/dataframe/core.py in to_hdf(self, path_or_buf, key, mode, append, complevel, complib, fletcher32, **kwargs)
509 from .io import to_hdf
510 return to_hdf(self, path_or_buf, key, mode, append, complevel, complib,
--> 511 fletcher32, **kwargs)
512
513 @derived_from(pd.DataFrame)

/Applications/anaconda/envs/python3/lib/python3.5/site-packages/dask/dataframe/io.py in to_hdf(df, path_or_buf, key, mode, append, complevel, complib, fletcher32, get, **kwargs)
603
604 DataFrame._get(merge(df.dask, dsk), (name, df.npartitions - 1),
--> 605 get=get_sync, **kwargs)
606
607

/Applications/anaconda/envs/python3/lib/python3.5/site-packages/dask/base.py in _get(cls, dsk, keys, get, **kwargs)
41 get = get or _globals['get'] or cls._default_get
42 dsk2 = cls._optimize(dsk, keys, **kwargs)
---> 43 return get(dsk2, keys, **kwargs)
44
45 @classmethod

/Applications/anaconda/envs/python3/lib/python3.5/site-packages/dask/async.py in get_sync(dsk, keys, **kwargs)
514 queue = Queue()
515 return get_async(apply_sync, 1, dsk, keys, queue=queue,
--> 516 raise_on_exception=True, **kwargs)
517
518

/Applications/anaconda/envs/python3/lib/python3.5/site-packages/dask/async.py in get_async(apply_async, num_workers, dsk, result, cache, queue, get_id, raise_on_exception, rerun_exceptions_locally, callbacks, **kwargs)
485 f(key, res, dsk, state, worker_id)
486 while state['ready'] and len(state['running']) < num_workers:
--> 487 fire_task()
488
489 # Final reporting

/Applications/anaconda/envs/python3/lib/python3.5/site-packages/dask/async.py in fire_task()
456 # Submit
457 apply_async(execute_task, args=[key, dsk[key], data, queue,
--> 458 get_id, raise_on_exception])
459
460 # Seed initial tasks into the thread pool

/Applications/anaconda/envs/python3/lib/python3.5/site-packages/dask/async.py in apply_sync(func, args, kwds)
506 def apply_sync(func, args=(), kwds={}):
507 """ A naive synchronous version of apply_async """
--> 508 return func(*args, **kwds)
509
510

/Applications/anaconda/envs/python3/lib/python3.5/site-packages/dask/async.py in execute_task(key, task, data, queue, get_id, raise_on_exception)
262 """
263 try:
--> 264 result = _execute_task(task, data)
265 id = get_id()
266 result = key, result, None, id

/Applications/anaconda/envs/python3/lib/python3.5/site-packages/dask/async.py in _execute_task(arg, cache, dsk)
243 elif istask(arg):
244 func, args = arg[0], arg[1:]
--> 245 args2 = [_execute_task(a, cache) for a in args]
246 return func(*args2)
247 elif not ishashable(arg):

/Applications/anaconda/envs/python3/lib/python3.5/site-packages/dask/async.py in <listcomp>(.0)
243 elif istask(arg):
244 func, args = arg[0], arg[1:]
--> 245 args2 = [_execute_task(a, cache) for a in args]
246 return func(*args2)
247 elif not ishashable(arg):

/Applications/anaconda/envs/python3/lib/python3.5/site-packages/dask/async.py in _execute_task(arg, cache, dsk)
244 func, args = arg[0], arg[1:]
245 args2 = [_execute_task(a, cache) for a in args]
--> 246 return func(*args2)
247 elif not ishashable(arg):
248 return arg

/Applications/anaconda/envs/python3/lib/python3.5/site-packages/dask/compatibility.py in apply(func, args, kwargs)
24 def apply(func, args, kwargs=None):
25 if kwargs:
---> 26 return func(*args, **kwargs)
27 else:
28 return func(*args)

/Applications/anaconda/envs/python3/lib/python3.5/site-packages/pandas/core/generic.py in to_hdf(self, path_or_buf, key, **kwargs)
937
938 from pandas.io import pytables
--> 939 return pytables.to_hdf(path_or_buf, key, self, **kwargs)
940
941 def to_msgpack(self, path_or_buf=None, **kwargs):

/Applications/anaconda/envs/python3/lib/python3.5/site-packages/pandas/io/pytables.py in to_hdf(path_or_buf, key, value, mode, complevel, complib, append, **kwargs)
255 with HDFStore(path_or_buf, mode=mode, complevel=complevel,
256 complib=complib) as store:
--> 257 f(store)
258 else:
259 f(path_or_buf)

/Applications/anaconda/envs/python3/lib/python3.5/site-packages/pandas/io/pytables.py in <lambda>(store)
248
249 if append:
--> 250 f = lambda store: store.append(key, value, **kwargs)
251 else:
252 f = lambda store: store.put(key, value, **kwargs)

/Applications/anaconda/envs/python3/lib/python3.5/site-packages/pandas/io/pytables.py in append(self, key, value, format, append, columns, dropna, **kwargs)
905 kwargs = self._validate_format(format, kwargs)
906 self._write_to_group(key, value, append=append, dropna=dropna,
--> 907 **kwargs)
908
909 def append_to_multiple(self, d, value, selector, data_columns=None,

/Applications/anaconda/envs/python3/lib/python3.5/site-packages/pandas/io/pytables.py in _write_to_group(self, key, value, format, index, append, complib, encoding, **kwargs)
1250
1251 # write the object
-> 1252 s.write(obj=value, append=append, complib=complib, **kwargs)
1253
1254 if s.is_table and index:

/Applications/anaconda/envs/python3/lib/python3.5/site-packages/pandas/io/pytables.py in write(self, obj, axes, append, complib, complevel, fletcher32, min_itemsize, chunksize, expectedrows, dropna, **kwargs)
3755 self.create_axes(axes=axes, obj=obj, validate=append,
3756 min_itemsize=min_itemsize,
-> 3757 **kwargs)
3758
3759 for a in self.axes:

/Applications/anaconda/envs/python3/lib/python3.5/site-packages/pandas/io/pytables.py in create_axes(self, axes, obj, validate, nan_rep, data_columns, min_itemsize, **kwargs)
3432 self.values_axes.append(col)
3433 except (NotImplementedError, ValueError, TypeError) as e:
-> 3434 raise e
3435 except Exception as detail:
3436 raise Exception(

/Applications/anaconda/envs/python3/lib/python3.5/site-packages/pandas/io/pytables.py in create_axes(self, axes, obj, validate, nan_rep, data_columns, min_itemsize, **kwargs)
3427 encoding=self.encoding,
3428 info=self.info,
-> 3429 **kwargs)
3430 col.set_pos(j)
3431

/Applications/anaconda/envs/python3/lib/python3.5/site-packages/pandas/io/pytables.py in set_atom(self, block, block_items, existing_col, min_itemsize, nan_rep, info, encoding, **kwargs)
1814 min_itemsize,
1815 nan_rep,
-> 1816 encoding)
1817
1818 # set as a data block

/Applications/anaconda/envs/python3/lib/python3.5/site-packages/pandas/io/pytables.py in set_atom_string(self, block, block_items, existing_col, min_itemsize, nan_rep, encoding)
1861 # check for column in the values conflicts
1862 if existing_col is not None:
-> 1863 eci = existing_col.validate_col(itemsize)
1864 if eci > itemsize:
1865 itemsize = eci

/Applications/anaconda/envs/python3/lib/python3.5/site-packages/pandas/io/pytables.py in validate_col(self, itemsize)
1556 "column but\nthis column has a limit of [%s]!\n"
1557 "Consider using min_itemsize to preset the sizes on "
-> 1558 "these columns" % (itemsize, self.cname, c.itemsize))
1559 return c.itemsize
1560

ValueError: Trying to store a string with len [69] in [values_block_2] column but
this column has a limit of [67]!
Consider using min_itemsize to preset the sizes on these columns

... and there is no 'values_block_2' column in the CSV dataset, either.
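For context: 'values_block_2' is not one of the CSV's columns. It is the internal name PyTables gives to a block of same-typed value columns that pandas packs together in the table. The failure is consistent with min_itemsize being dropped on the way down: dask appends the partitions one at a time, the first append created the string block with an itemsize of 67 (the longest string it happened to contain), and a later partition with a 69-character string then no longer fits. Passing data_columns=True makes pandas store each column under its own name, so the error at least points at a real column. A minimal pandas-only sketch of both points (file name, key, and data are made up):

import pandas as pd

# Two frames mimicking two dask partitions appended in sequence; the
# second holds a longer string than the first.
df1 = pd.DataFrame({'a': ['x' * 67]})
df2 = pd.DataFrame({'a': ['y' * 69]})

with pd.HDFStore('/tmp/demo.h5') as store:
    # min_itemsize on the first append sizes the column up front, so the
    # longer string in the second append still fits; data_columns=True
    # stores 'a' under its own name instead of a values_block_N block.
    store.append('demo', df1, min_itemsize=100, data_columns=True)
    store.append('demo', df2)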

dask.__version__
'0.7.6'
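Until min_itemsize is forwarded correctly, a possible workaround is to append each partition with pandas directly, so the keyword reaches HDFStore on the very first write that creates the table. A rough sketch, assuming get_partition() is available in this version and each partition fits in memory (untested):

import dask.dataframe

bigcsv = dask.dataframe.read_csv('bigmessy.csv')

# Append one partition at a time via pandas' own to_hdf so that
# min_itemsize is applied when the table is first created.
for i in range(bigcsv.npartitions):
    part = bigcsv.get_partition(i).compute()
    part.to_hdf('/tmp/bigcsv.store', key='bigcsv', format='table',
                append=True, min_itemsize=100)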
