1 change: 1 addition & 0 deletions doc/api.rst
@@ -29,6 +29,7 @@ Top-level Classes
:toctree: generated/
:template: function.rst

attributes_arff_from_df
check_datasets_active
create_dataset
get_dataset
85 changes: 67 additions & 18 deletions examples/create_upload_tutorial.py
@@ -5,35 +5,36 @@
A tutorial on how to create and upload a dataset to OpenML.
"""
import numpy as np
import pandas as pd
import sklearn.datasets
from scipy.sparse import coo_matrix

import openml
from openml.datasets.functions import create_dataset

############################################################################
# For this tutorial we will upload to the test server to not pollute the live
# server with countless copies of the same dataset.
openml.config.server = 'https://test.openml.org/api/v1/xml'

############################################################################
# Below we will cover the following cases of the dataset object:
#
# * A numpy array
# * A list
# * A pandas dataframe
# * A sparse matrix

############################################################################
# Dataset is a numpy array
# ========================
# A numpy array can contain lists in the case of dense data or it can contain
# OrderedDicts in the case of sparse data.
#
# Prepare dataset
# ^^^^^^^^^^^^^^^
# Load an example dataset from scikit-learn which we will upload to OpenML.org
# via the API.

diabetes = sklearn.datasets.load_diabetes()
name = 'Diabetes(scikit-learn)'
@@ -43,11 +44,11 @@
description = diabetes.DESCR

############################################################################
# OpenML does not distinguish between the attributes and targets on the data
# level and stores all data in a single matrix.
#
# The target feature is indicated as meta-data of the dataset (and tasks on
# that data).

data = np.concatenate((X, y.reshape((-1, 1))), axis=1)
attribute_names = list(attribute_names)
@@ -67,13 +68,13 @@
############################################################################
# Create the dataset object
# ^^^^^^^^^^^^^^^^^^^^^^^^^
# The definition of all fields can be found in the XSD files describing the
# expected format:
#
# https://github.com/openml/OpenML/blob/master/openml_OS/views/pages/api_new/v1/xsd/openml.data.upload.xsd

diabetes_dataset = create_dataset(
# The name of the dataset (needs to be unique).
# Must not be longer than 128 characters and only contain
# a-z, A-Z, 0-9 and the following special characters: _\-\.(),
name=name,
@@ -93,9 +94,11 @@
licence='BSD (from scikit-learn)',
# Name of the target. Can also have multiple values (comma-separated).
default_target_attribute='class',
# The attribute that represents the row-id column, if present in the
# dataset.
row_id_attribute=None,
# Attributes that should be excluded in modelling, such as identifiers and
# indexes.
ignore_attribute=None,
# How to cite the paper.
citation=citation,
@@ -118,8 +121,8 @@
############################################################################
# Dataset is a list
# =================
# A list can contain lists in the case of dense data or it can contain
# OrderedDicts in the case of sparse data, as sketched below.
#
# Weather dataset:
# http://storm.cis.fordham.edu/~gweiss/data-mining/datasets.html
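############################################################################
# As a minimal sketch (illustrative, not taken from this diff), sparse data
# can be expressed as a list of OrderedDicts, each mapping a column index to
# a non-zero value; create_dataset detects dict rows and emits sparse ARFF.

from collections import OrderedDict

sparse_rows = [
    OrderedDict([(0, 1.0), (2, 5.0)]),  # row 0: columns 0 and 2 are set
    OrderedDict([(1, 3.0)]),            # row 1: only column 1 is set
]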
@@ -188,6 +191,52 @@
upload_did = weather_dataset.publish()
print('URL for dataset: %s/data/%d' % (openml.config.server, upload_did))

############################################################################
# Dataset is a pandas DataFrame
# =============================
# It might happen that your dataset is made of heterogeneous data, which is
# usually stored as a pandas DataFrame. A DataFrame stores the data type of
# each column as well as the attribute names. Therefore, when providing a
# pandas DataFrame, OpenML can infer this information without it having to be
# provided explicitly in the call to the function :func:`create_dataset`. In
# this case, you only need to pass ``'auto'`` to the ``attributes`` parameter.

df = pd.DataFrame(data, columns=[col_name for col_name, _ in attribute_names])
# enforce the categorical columns to have a categorical dtype and the
# 'windy' column to be boolean
df['outlook'] = df['outlook'].astype('category')
df['windy'] = df['windy'].astype('bool')
df['play'] = df['play'].astype('category')
df.info()

############################################################################
# We enforce the columns 'outlook' and 'play' to have a categorical dtype and
# the column 'windy' to be boolean, while the column 'rnd_str' is kept as a
# string column. Then, we can call :func:`create_dataset` by passing the
# dataframe and setting the parameter ``attributes`` to ``'auto'``.

weather_dataset = create_dataset(
name="Weather",
description=description,
creator='I. H. Witten, E. Frank, M. A. Hall, and ITPro',
contributor=None,
collection_date='01-01-2011',
language='English',
licence=None,
default_target_attribute='play',
row_id_attribute=None,
ignore_attribute=None,
citation=citation,
attributes='auto',
data=df,
version_label='example',
)

############################################################################

upload_did = weather_dataset.publish()
print('URL for dataset: %s/data/%d' % (openml.config.server, upload_did))

############################################################################
# Dataset is a sparse matrix
# ==========================
92 changes: 85 additions & 7 deletions openml/datasets/functions.py
@@ -6,6 +6,8 @@
import numpy as np
import six
import arff
import pandas as pd

import xmltodict
from scipy.sparse import coo_matrix
from oslo_concurrency import lockutils
@@ -359,6 +361,59 @@ def get_dataset(dataset_id):
return dataset


def attributes_arff_from_df(df):
"""Create the attributes as specified by the ARFF format using a dataframe.

Parameters
----------
df : DataFrame, shape (n_samples, n_features)
The dataframe containing the data set.

Returns
-------
attributes_arff : str
The data set attributes as required by the ARFF format.
"""
PD_DTYPES_TO_ARFF_DTYPE = {
'integer': 'INTEGER',
'floating': 'REAL',
'string': 'STRING'
}
attributes_arff = []
for column_name in df:
# infer_dtype(skipna=True) does not infer the dtype properly, so the NA
# values are dropped before the inference instead.
column_dtype = pd.api.types.infer_dtype(df[column_name].dropna())

if column_dtype == 'categorical':
Review thread:

Member: I missed the discussion in the thread. Is this a regular pandas
data type? Is there any reason not to use the dtype str? (The comments
below do not seem to help me too much, sorry.)

Contributor (author): This is an available dtype in pandas to indicate
that a column is a categorical column. As for using str: actually I don't
see why categories should be strings :). Right now liac-arff does not
allow otherwise; we could think about silently converting the categories
to string if a column is of category dtype.

Member: Cool, thanks for the clarification :)
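
A minimal sketch of the behaviour discussed in this thread (assuming a
pandas version that provides pd.api.types.infer_dtype, as pinned in
setup.py):

import pandas as pd

s = pd.Series(['a', 'b', 'a'], dtype='category')
print(s.dtype)                                     # category
print(pd.api.types.infer_dtype(s))                 # 'categorical'
print(pd.api.types.infer_dtype(s.cat.categories))  # 'string'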

# for a categorical feature, arff expects a list of strings. However, a
# categorical column can contain mixed types, so we raise an error
# asking the user to convert all entries to string.
categories = df[column_name].cat.categories
categories_dtype = pd.api.types.infer_dtype(categories)
if categories_dtype not in ('string', 'unicode'):
raise ValueError("The column '{}' of the dataframe is of "
"'category' dtype. Therefore, all values in "
"this columns should be string. Please "
"convert the entries which are not string. "
"Got {} dtype in this column."
.format(column_name, categories_dtype))
attributes_arff.append((column_name, categories.tolist()))
elif column_dtype == 'boolean':
Review thread:

Member: Why not use dtype=bool?

Contributor (author): Because we are using the pd.api.types.infer_dtype
function, which returns a string.
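
A quick sketch of why string labels are compared here: infer_dtype returns
a plain string label rather than a numpy dtype object.

import pandas as pd

s = pd.Series([True, False, True])
print(s.dtype)                      # bool (a numpy dtype)
print(pd.api.types.infer_dtype(s))  # 'boolean' (a plain string)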

# booleans are encoded as categorical with the two values 'True' and
# 'False'.
attributes_arff.append((column_name, ['True', 'False']))
elif column_dtype in PD_DTYPES_TO_ARFF_DTYPE.keys():
attributes_arff.append((column_name,
PD_DTYPES_TO_ARFF_DTYPE[column_dtype]))
else:
raise ValueError("The dtype '{}' of the column '{}' is not "
"currently supported by liac-arff. Supported "
"dtypes are categorical, string, integer, "
"floating, and boolean."
.format(column_dtype, column_name))
return attributes_arff
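
A short usage sketch for this helper (hypothetical column names; the
expected output assumes the dtypes shown in the comments):

import pandas as pd
from openml.datasets.functions import attributes_arff_from_df

df = pd.DataFrame({'age': [23, 45],          # integer -> 'INTEGER'
                   'weight': [70.5, 80.1]})  # floating -> 'REAL'
df['label'] = pd.Series(['yes', 'no'], dtype='category')
print(attributes_arff_from_df(df))
# [('age', 'INTEGER'), ('weight', 'REAL'), ('label', ['no', 'yes'])]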


def create_dataset(name, description, creator, contributor,
collection_date, language,
licence, attributes, data,
@@ -394,11 +449,16 @@ def create_dataset(name, description, creator, contributor,
Starts with 1 upper case letter, rest lower case, e.g. 'English'.
licence : str
License of the data.
attributes : list, dict, or 'auto'
A list of tuples. Each tuple consists of the attribute name and type.
If passing a pandas DataFrame, the attributes can be automatically
inferred by passing ``'auto'``. Specific attributes can be manually
specified by passing a dictionary where the key is the name of the
attribute and the value is the data type of the attribute.
data : ndarray, list, dataframe, coo_matrix, shape (n_samples, n_features)
An array that contains both the attributes and the targets. When
providing a dataframe, the attribute names and type can be inferred by
passing ``attributes='auto'``.
The target feature is indicated as meta-data of the dataset.
default_target_attribute : str
The default target attribute, if it exists.
@@ -423,6 +483,24 @@
:class:`openml.OpenMLDataset`
Dataset description."""

if attributes == 'auto' or isinstance(attributes, dict):
if not hasattr(data, "columns"):
raise ValueError("Automatically inferring the attributes required "
"a pandas DataFrame. A {!r} was given instead."
.format(data))
# infer the type of data for each column of the DataFrame
attributes_ = attributes_arff_from_df(data)
if isinstance(attributes, dict):
# override the inferred attributes with those specified by the user
Review thread:

Collaborator: Doesn't this do the exact opposite? It overrides the
attributes from the dataframe with the arguments passed by the user,
right?

Contributor (author): Yes. It is the mechanism that allows the user to
overwrite specific attributes. It can be useful to force a specific data
type or to specify the categories (e.g. if some categories are missing
from the data column).
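
A sketch of the override mechanism discussed here, equivalent to what the
loop below does (hypothetical category values):

inferred = attributes_arff_from_df(df)  # e.g. [('outlook', ['rainy', 'sunny']), ...]
user = {'outlook': ['sunny', 'overcast', 'rainy']}  # force the full list
merged = [(name, user.get(name, dtype)) for name, dtype in inferred]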

for attr_idx in range(len(attributes_)):
attr_name = attributes_[attr_idx][0]
if attr_name in attributes.keys():
attributes_[attr_idx] = (attr_name, attributes[attr_name])
else:
attributes_ = attributes

data = data.values if hasattr(data, "columns") else data
Review thread:

Member: This seems to imply that internally we store everything as ARFF.
Wouldn't it be much cooler to store everything internally as pandas? That
would make it much easier to do operations on the datasets (and of course
convert before uploading a dataset / performing a run). However, maybe
that should be a next PR.

Contributor (author): I would say that this is another PR. I would
imagine that it would require quite some changes, but if the consensus is
to use pandas then why not. Then the data would be only tabular data
(which is probably the main use case in OpenML).

Contributor: Sparse data might be an issue, right?

Contributor (author): Sparse arrays do not have a values attribute, so we
should be fine. If you are thinking about SparseDataFrame, I would say
that we should not offer support: they are going to be deprecated in
favor of the sparse dtype. Is liac-arff actually usable with sparse data?

Collaborator: Yes, it can handle scipy's COO data structure.

Contributor (author): So this will not be an issue. @amueller, could you
elaborate on your thoughts?
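
For reference, a minimal sketch of the sparse case discussed above
(illustrative values): a coo_matrix has no 'columns' attribute, so the
conversion above leaves it untouched.

import numpy as np
from scipy.sparse import coo_matrix

sparse = coo_matrix((np.array([1.0, 2.0]),
                     (np.array([0, 1]), np.array([0, 2]))),
                    shape=(2, 3))
print(hasattr(sparse, 'columns'))  # False, so `data` passes through as-is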


if format is not None:
warn("The format parameter will be deprecated in the future,"
" the method will determine the format of the ARFF "
@@ -431,8 +509,8 @@

# Determine ARFF format from the dataset
else:
if isinstance(data, (list, np.ndarray)):
if isinstance(data[0], (list, np.ndarray)):
d_format = 'arff'
elif isinstance(data[0], dict):
d_format = 'sparse_arff'
@@ -455,7 +533,7 @@
arff_object = {
'relation': name,
'description': description,
'attributes': attributes_,
'data': data
}

1 change: 1 addition & 0 deletions setup.py
@@ -45,6 +45,7 @@
'nbformat',
'python-dateutil',
'oslo.concurrency',
'pandas>=0.19.2',
],
extras_require={
'test': [