-
-
Notifications
You must be signed in to change notification settings - Fork 211
[MRG] EHN: allow to upload DataFrame and infer dtype and column name #545
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
01e1bd2
19fe4cc
2ed1928
8901ee7
efeed09
ceaf990
10a1562
e870733
eb6415b
6132296
08c1496
ee872c5
3aaef38
32fe04e
ccf7b82
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -6,6 +6,8 @@ | |
| import numpy as np | ||
| import six | ||
| import arff | ||
| import pandas as pd | ||
|
|
||
| import xmltodict | ||
| from scipy.sparse import coo_matrix | ||
| from oslo_concurrency import lockutils | ||
|
|
@@ -359,6 +361,59 @@ def get_dataset(dataset_id): | |
| return dataset | ||
|
|
||
|
|
||
| def attributes_arff_from_df(df): | ||
| """Create the attributes as specified by the ARFF format using a dataframe. | ||
|
|
||
| Parameters | ||
| ---------- | ||
| df : DataFrame, shape (n_samples, n_features) | ||
| The dataframe containing the data set. | ||
|
|
||
| Returns | ||
| ------- | ||
| attributes_arff : str | ||
| The data set attributes as required by the ARFF format. | ||
| """ | ||
| PD_DTYPES_TO_ARFF_DTYPE = { | ||
| 'integer': 'INTEGER', | ||
| 'floating': 'REAL', | ||
| 'string': 'STRING' | ||
| } | ||
| attributes_arff = [] | ||
| for column_name in df: | ||
| # skipna=True does not infer properly the dtype. The NA values are | ||
| # dropped before the inference instead. | ||
| column_dtype = pd.api.types.infer_dtype(df[column_name].dropna()) | ||
|
|
||
| if column_dtype == 'categorical': | ||
| # for a categorical feature, arff expects a list of strings. However, a | ||
| # categorical column can contain mixed type and we should therefore | ||
| # raise an error asking to convert all entries to string. | ||
| categories = df[column_name].cat.categories | ||
| categories_dtype = pd.api.types.infer_dtype(categories) | ||
| if categories_dtype not in ('string', 'unicode'): | ||
| raise ValueError("The column '{}' of the dataframe is of " | ||
| "'category' dtype. Therefore, all values in " | ||
| "this columns should be string. Please " | ||
| "convert the entries which are not string. " | ||
| "Got {} dtype in this column." | ||
| .format(column_name, categories_dtype)) | ||
| attributes_arff.append((column_name, categories.tolist())) | ||
| elif column_dtype == 'boolean': | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why not use dtype=bool?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. because we are using the |
||
| # boolean are encoded as categorical. | ||
| attributes_arff.append((column_name, ['True', 'False'])) | ||
| elif column_dtype in PD_DTYPES_TO_ARFF_DTYPE.keys(): | ||
| attributes_arff.append((column_name, | ||
| PD_DTYPES_TO_ARFF_DTYPE[column_dtype])) | ||
| else: | ||
| raise ValueError("The dtype '{}' of the column '{}' is not " | ||
| "currently supported by liac-arff. Supported " | ||
| "dtypes are categorical, string, integer, " | ||
| "floating, and boolean." | ||
| .format(column_dtype, column_name)) | ||
| return attributes_arff | ||
|
|
||
|
|
||
| def create_dataset(name, description, creator, contributor, | ||
| collection_date, language, | ||
| licence, attributes, data, | ||
|
|
@@ -394,11 +449,16 @@ def create_dataset(name, description, creator, contributor, | |
| Starts with 1 upper case letter, rest lower case, e.g. 'English'. | ||
| licence : str | ||
| License of the data. | ||
| attributes : list | ||
| attributes : list, dict, or 'auto' | ||
| A list of tuples. Each tuple consists of the attribute name and type. | ||
| data : numpy.ndarray | list | scipy.sparse.coo_matrix | ||
| An array that contains both the attributes and the targets, with | ||
| shape=(n_samples, n_features). | ||
| If passing a pandas DataFrame, the attributes can be automatically | ||
| inferred by passing ``'auto'``. Specific attributes can be manually | ||
| specified by passing a dictionary where the key is the name of the | ||
| attribute and the value is the data type of the attribute. | ||
| data : ndarray, list, dataframe, coo_matrix, shape (n_samples, n_features) | ||
| An array that contains both the attributes and the targets. When | ||
| providing a dataframe, the attribute names and types can be inferred by | ||
| passing ``attributes='auto'``. | ||
| The target feature is indicated as meta-data of the dataset. | ||
| default_target_attribute : str | ||
| The default target attribute, if it exists. | ||
|
|
@@ -423,6 +483,24 @@ def create_dataset(name, description, creator, contributor, | |
| class:`openml.OpenMLDataset` | ||
| Dataset description.""" | ||
|
|
||
| if attributes == 'auto' or isinstance(attributes, dict): | ||
| if not hasattr(data, "columns"): | ||
| raise ValueError("Automatically inferring the attributes required " | ||
| "a pandas DataFrame. A {!r} was given instead." | ||
| .format(data)) | ||
| # infer the type of data for each column of the DataFrame | ||
| attributes_ = attributes_arff_from_df(data) | ||
| if isinstance(attributes, dict): | ||
| # override the attributes that were specified by the user | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Doesn't this do the exact opposite? It overrides the attributes from the dataframe with the arguments passed by the user, right?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes. It is the mechanism that allow the user to overwrite some specific attribute. So it could be useful to force a specific data type or specify the categories (e.g. if there is some missing categories in the data column). |
||
| for attr_idx in range(len(attributes_)): | ||
| attr_name = attributes_[attr_idx][0] | ||
| if attr_name in attributes.keys(): | ||
| attributes_[attr_idx] = (attr_name, attributes[attr_name]) | ||
| else: | ||
| attributes_ = attributes | ||
|
|
||
| data = data.values if hasattr(data, "columns") else data | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This seems to imply that internally we store everything as arff. Wouldn't it be much cooler to store everything internally as pandas, that will make it much easier to do operations on the datasets. (And of course convert before uploading dataset / performing a run). However maybe that should be a next PR.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would say that this is another PR. I would imagine that it would require quite some changes but if the consensus is to use pandas then why not. Then, the data should be only tabular data (which is probably the main use case in openml)
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sparse data might be an issue, right?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sparse arrays do not have `columns`. If you think about SparseDataFrame, I would say that we should not offer support. They are going to be deprecated in favor of sparse dtype. Is the liac-arff actually usable with sparse data?
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, it can handle scipy's COO data structure.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So this will not be an issue. @amueller could you elaborate on your thoughts. |
||
|
|
||
| if format is not None: | ||
| warn("The format parameter will be deprecated in the future," | ||
| " the method will determine the format of the ARFF " | ||
|
|
@@ -431,8 +509,8 @@ def create_dataset(name, description, creator, contributor, | |
|
|
||
| # Determine ARFF format from the dataset | ||
| else: | ||
| if isinstance(data, list) or isinstance(data, np.ndarray): | ||
| if isinstance(data[0], list) or isinstance(data[0], np.ndarray): | ||
| if isinstance(data, (list, np.ndarray)): | ||
| if isinstance(data[0], (list, np.ndarray)): | ||
| d_format = 'arff' | ||
| elif isinstance(data[0], dict): | ||
| d_format = 'sparse_arff' | ||
|
|
@@ -455,7 +533,7 @@ def create_dataset(name, description, creator, contributor, | |
| arff_object = { | ||
| 'relation': name, | ||
| 'description': description, | ||
| 'attributes': attributes, | ||
| 'attributes': attributes_, | ||
| 'data': data | ||
| } | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -45,6 +45,7 @@ | |
| 'nbformat', | ||
| 'python-dateutil', | ||
| 'oslo.concurrency', | ||
| 'pandas>=0.19.2', | ||
| ], | ||
| extras_require={ | ||
| 'test': [ | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I missed the discussion in the thread. is this a regular pandas data type? Is there any reason to not use the dtype str?
(the comments below do not seem to help me too much, sorry)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is an available dtype in pandas to mention that a column is a categorical column.
Actually I don't see why categories should be string :). Right now, the liac-arff does not allow otherwise, we could actually think about converting silently the categories to string if a column is of category dtype.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Cool, thanks for the clarification :)