```python
import dask.dataframe

# Read in a large, messy CSV
bigcsv = dask.dataframe.read_csv('bigmessy.csv')

# Write it out to HDF
bigcsv.to_hdf('/tmp/bigcsv.store', key='bigcsv', min_itemsize=100)
```
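In case it helps, here is my guess at a self-contained reproduction (hypothetical path and synthetic data; I'm assuming the trigger is that a later partition contains longer strings than the partition that first created the table):

```python
import pandas as pd
import dask.dataframe as dd

# Strings in the second half are much longer than in the first half, so
# the second partition should overflow whatever itemsize the first
# partition's write established.
df = pd.DataFrame({'x': ['a' * 10] * 100 + ['b' * 200] * 100})
ddf = dd.from_pandas(df, 2)  # two partitions

# min_itemsize is large enough for every value, yet the error below
# suggests it never reaches the write that creates the table.
ddf.to_hdf('/tmp/repro.h5', key='repro', min_itemsize=250)
```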
```
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-...> in <module>()
----> 1 bigcsv.to_hdf('/tmp/bigcsv.store', key='bigcsv', min_itemsize=100)

/Applications/anaconda/envs/python3/lib/python3.5/site-packages/dask/dataframe/core.py in to_hdf(self, path_or_buf, key, mode, append, complevel, complib, fletcher32, **kwargs)
    509         from .io import to_hdf
    510         return to_hdf(self, path_or_buf, key, mode, append, complevel, complib,
--> 511                       fletcher32, **kwargs)
    512 
    513     @derived_from(pd.DataFrame)

/Applications/anaconda/envs/python3/lib/python3.5/site-packages/dask/dataframe/io.py in to_hdf(df, path_or_buf, key, mode, append, complevel, complib, fletcher32, get, **kwargs)
    603 
    604     DataFrame._get(merge(df.dask, dsk), (name, df.npartitions - 1),
--> 605                    get=get_sync, **kwargs)
    606 
    607 

/Applications/anaconda/envs/python3/lib/python3.5/site-packages/dask/base.py in _get(cls, dsk, keys, get, **kwargs)
     41         get = get or _globals['get'] or cls._default_get
     42         dsk2 = cls._optimize(dsk, keys, **kwargs)
---> 43         return get(dsk2, keys, **kwargs)
     44 
     45     @classmethod

/Applications/anaconda/envs/python3/lib/python3.5/site-packages/dask/async.py in get_sync(dsk, keys, **kwargs)
    514     queue = Queue()
    515     return get_async(apply_sync, 1, dsk, keys, queue=queue,
--> 516                      raise_on_exception=True, **kwargs)
    517 
    518 

/Applications/anaconda/envs/python3/lib/python3.5/site-packages/dask/async.py in get_async(apply_async, num_workers, dsk, result, cache, queue, get_id, raise_on_exception, rerun_exceptions_locally, callbacks, **kwargs)
    485                 f(key, res, dsk, state, worker_id)
    486             while state['ready'] and len(state['running']) < num_workers:
--> 487                 fire_task()
    488 
    489         # Final reporting

/Applications/anaconda/envs/python3/lib/python3.5/site-packages/dask/async.py in fire_task()
    456             # Submit
    457             apply_async(execute_task, args=[key, dsk[key], data, queue,
--> 458                                             get_id, raise_on_exception])
    459 
    460         # Seed initial tasks into the thread pool

/Applications/anaconda/envs/python3/lib/python3.5/site-packages/dask/async.py in apply_sync(func, args, kwds)
    506 def apply_sync(func, args=(), kwds={}):
    507     """ A naive synchronous version of apply_async """
--> 508     return func(*args, **kwds)
    509 
    510 

/Applications/anaconda/envs/python3/lib/python3.5/site-packages/dask/async.py in execute_task(key, task, data, queue, get_id, raise_on_exception)
    262     """
    263     try:
--> 264         result = _execute_task(task, data)
    265         id = get_id()
    266         result = key, result, None, id

/Applications/anaconda/envs/python3/lib/python3.5/site-packages/dask/async.py in _execute_task(arg, cache, dsk)
    243     elif istask(arg):
    244         func, args = arg[0], arg[1:]
--> 245         args2 = [_execute_task(a, cache) for a in args]
    246         return func(*args2)
    247     elif not ishashable(arg):

/Applications/anaconda/envs/python3/lib/python3.5/site-packages/dask/async.py in <listcomp>(.0)
    243     elif istask(arg):
    244         func, args = arg[0], arg[1:]
--> 245         args2 = [_execute_task(a, cache) for a in args]
    246         return func(*args2)
    247     elif not ishashable(arg):

/Applications/anaconda/envs/python3/lib/python3.5/site-packages/dask/async.py in _execute_task(arg, cache, dsk)
    244         func, args = arg[0], arg[1:]
    245         args2 = [_execute_task(a, cache) for a in args]
--> 246         return func(*args2)
    247     elif not ishashable(arg):
    248         return arg

/Applications/anaconda/envs/python3/lib/python3.5/site-packages/dask/compatibility.py in apply(func, args, kwargs)
     24     def apply(func, args, kwargs=None):
     25         if kwargs:
---> 26             return func(*args, **kwargs)
     27         else:
     28             return func(*args)

/Applications/anaconda/envs/python3/lib/python3.5/site-packages/pandas/core/generic.py in to_hdf(self, path_or_buf, key, **kwargs)
    937 
    938         from pandas.io import pytables
--> 939         return pytables.to_hdf(path_or_buf, key, self, **kwargs)
    940 
    941     def to_msgpack(self, path_or_buf=None, **kwargs):

/Applications/anaconda/envs/python3/lib/python3.5/site-packages/pandas/io/pytables.py in to_hdf(path_or_buf, key, value, mode, complevel, complib, append, **kwargs)
    255         with HDFStore(path_or_buf, mode=mode, complevel=complevel,
    256                       complib=complib) as store:
--> 257             f(store)
    258     else:
    259         f(path_or_buf)

/Applications/anaconda/envs/python3/lib/python3.5/site-packages/pandas/io/pytables.py in <lambda>(store)
    248 
    249     if append:
--> 250         f = lambda store: store.append(key, value, **kwargs)
    251     else:
    252         f = lambda store: store.put(key, value, **kwargs)

/Applications/anaconda/envs/python3/lib/python3.5/site-packages/pandas/io/pytables.py in append(self, key, value, format, append, columns, dropna, **kwargs)
    905         kwargs = self._validate_format(format, kwargs)
    906         self._write_to_group(key, value, append=append, dropna=dropna,
--> 907                              **kwargs)
    908 
    909     def append_to_multiple(self, d, value, selector, data_columns=None,

/Applications/anaconda/envs/python3/lib/python3.5/site-packages/pandas/io/pytables.py in _write_to_group(self, key, value, format, index, append, complib, encoding, **kwargs)
   1250 
   1251         # write the object
-> 1252         s.write(obj=value, append=append, complib=complib, **kwargs)
   1253 
   1254         if s.is_table and index:

/Applications/anaconda/envs/python3/lib/python3.5/site-packages/pandas/io/pytables.py in write(self, obj, axes, append, complib, complevel, fletcher32, min_itemsize, chunksize, expectedrows, dropna, **kwargs)
   3755         self.create_axes(axes=axes, obj=obj, validate=append,
   3756                          min_itemsize=min_itemsize,
-> 3757                          **kwargs)
   3758 
   3759         for a in self.axes:

/Applications/anaconda/envs/python3/lib/python3.5/site-packages/pandas/io/pytables.py in create_axes(self, axes, obj, validate, nan_rep, data_columns, min_itemsize, **kwargs)
   3432                 self.values_axes.append(col)
   3433             except (NotImplementedError, ValueError, TypeError) as e:
-> 3434                 raise e
   3435             except Exception as detail:
   3436                 raise Exception(

/Applications/anaconda/envs/python3/lib/python3.5/site-packages/pandas/io/pytables.py in create_axes(self, axes, obj, validate, nan_rep, data_columns, min_itemsize, **kwargs)
   3427                               encoding=self.encoding,
   3428                               info=self.info,
-> 3429                               **kwargs)
   3430             col.set_pos(j)
   3431 

/Applications/anaconda/envs/python3/lib/python3.5/site-packages/pandas/io/pytables.py in set_atom(self, block, block_items, existing_col, min_itemsize, nan_rep, info, encoding, **kwargs)
   1814                                  min_itemsize,
   1815                                  nan_rep,
-> 1816                                  encoding)
   1817 
   1818         # set as a data block

/Applications/anaconda/envs/python3/lib/python3.5/site-packages/pandas/io/pytables.py in set_atom_string(self, block, block_items, existing_col, min_itemsize, nan_rep, encoding)
   1861             # check for column in the values conflicts
   1862             if existing_col is not None:
-> 1863                 eci = existing_col.validate_col(itemsize)
   1864                 if eci > itemsize:
   1865                     itemsize = eci

/Applications/anaconda/envs/python3/lib/python3.5/site-packages/pandas/io/pytables.py in validate_col(self, itemsize)
   1556                         "column but\nthis column has a limit of [%s]!\n"
   1557                         "Consider using min_itemsize to preset the sizes on "
-> 1558                         "these columns" % (itemsize, self.cname, c.itemsize))
   1559             return c.itemsize
   1560 

ValueError: Trying to store a string with len [69] in [values_block_2] column but
this column has a limit of [67]!
Consider using min_itemsize to preset the sizes on these columns
```
... and there is no 'values_block_2' column in the CSV dataset either. (As far as I can tell, `values_block_N` is an internal name that pandas/PyTables assigns to a block of same-typed columns, not a column from the file, so it is hard to even tell which CSV column is overflowing.)
```
dask.__version__
'0.7.6'
```
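For what it's worth, the workaround I'm using in the meantime is to do the write with plain pandas, streaming the CSV in chunks so that the table is created with `min_itemsize` applied from the start:

```python
import pandas as pd

# Append chunk by chunk; min_itemsize takes effect when the table is
# first created, so later chunks with longer strings still fit.
with pd.HDFStore('/tmp/bigcsv.store') as store:
    for chunk in pd.read_csv('bigmessy.csv', chunksize=100000):
        store.append('bigcsv', chunk, min_itemsize=100)
```

(One caveat: dtype inference can differ between chunks of a messy CSV, which can itself make `append` complain, so the chunk dtypes may need to be pinned with `dtype=` in `read_csv`.)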