Skip to content

Reinstantiating Setup with Column Transformer does not set back the transformer values #602

@janvanrijn

Description

@janvanrijn

Description

Re-instantiating a model that contains a ColumnTransformer from its setup does not restore the transformer's column values.

Steps/Code to Reproduce

import logging

import numpy as np
import openml
import sklearn.compose
import sklearn.ensemble
import sklearn.feature_selection
import sklearn.impute
import sklearn.pipeline
import sklearn.preprocessing

# Task that every run below is executed on; get_model() also reads this
# module-level name to look up the dataset's feature types.
task = openml.tasks.get_task(6)  # use task 2 on live, or task 6 on test


def get_model():
    """Build the pipeline whose re-instantiation from a setup is tested.

    The pipeline chains three steps:

    1. a ``ColumnTransformer`` that sends the numeric columns through an
       imputer (``SimpleImputer`` defaults) and a ``StandardScaler``, sends
       the nominal columns through a constant-fill imputer and a
       ``OneHotEncoder``, and passes every remaining column through,
    2. a ``VarianceThreshold`` feature selector,
    3. a ``RandomForestClassifier``.

    Column indices are looked up on the dataset of the module-level
    ``task``, with the task's target name passed as the list of features
    to exclude.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The unfitted pipeline.
    """
    # SimpleImputer is used instead of sklearn.preprocessing.Imputer, which
    # was deprecated in scikit-learn 0.20 and removed in 0.22.
    numeric_transformer = sklearn.pipeline.make_pipeline(
        sklearn.impute.SimpleImputer(),
        sklearn.preprocessing.StandardScaler())

    categorical_transformer = sklearn.pipeline.make_pipeline(
        sklearn.impute.SimpleImputer(strategy='constant', fill_value=-1),
        sklearn.preprocessing.OneHotEncoder(handle_unknown='ignore'))

    # Fetch the dataset once and reuse it for both feature-type look-ups.
    dataset = task.get_dataset()
    excluded = [task.target_name]
    nominal_indices = dataset.get_features_by_type('nominal', excluded)
    numeric_indices = dataset.get_features_by_type('numeric', excluded)

    # These column index lists are the values the issue reports as not
    # being set back when the model is re-instantiated from its setup.
    transformer = sklearn.compose.ColumnTransformer(
        transformers=[
            ('numeric', numeric_transformer, numeric_indices),
            ('nominal', categorical_transformer, nominal_indices)],
        remainder='passthrough')

    clf = sklearn.pipeline.make_pipeline(
        transformer,
        sklearn.feature_selection.VarianceThreshold(),
        sklearn.ensemble.RandomForestClassifier())
    return clf


# Step 1: build the model locally and run it on the task.
model_original = get_model()

run = openml.runs.run_model_on_task(
    model_original,
    task,
    avoid_duplicate_runs=False)
run_original = run.publish()  # this implicitly uploads the flow

# Step 2: download the published run again to obtain the id of its setup.
run_downloaded = openml.runs.get_run(run_original.run_id)
setup_id = run_downloaded.setup_id

# Step 3: re-instantiate the model from that setup id.
model_duplicate = openml.setups.initialize_model(setup_id)

# Step 4: run the re-instantiated model on the same task. This is the call
# quoted in the traceback under "Actual Results".
run_duplicate = openml.runs.run_model_on_task(
    model_duplicate, task, avoid_duplicate_runs=False)

Expected Results

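The run is executed twice without errors: once with the original model and once with the model re-instantiated from the setup.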

Actual Results

Traceback (most recent call last):
  File "/home/janvanrijn/projects/openml-python/examples/run_setup_tutorial.py", line 102, in <module>
    model_duplicate, task, avoid_duplicate_runs=False)
  File "/home/janvanrijn/projects/openml-python/openml/runs/functions.py", line 48, in run_model_on_task
    add_local_measures=add_local_measures)
  File "/home/janvanrijn/projects/openml-python/openml/runs/functions.py", line 119, in run_flow_on_task
    res = _run_task_get_arffcontent(flow.model, task, add_local_measures=add_local_measures)
  File "/home/janvanrijn/projects/openml-python/openml/runs/functions.py", line 458, in _run_task_get_arffcontent
    add_local_measures=add_local_measures)
  File "/home/janvanrijn/projects/openml-python/openml/runs/functions.py", line 572, in _run_model_on_fold
    model.fit(trainX, trainY)
  File "/home/janvanrijn/anaconda3/envs/openml-python/lib/python3.6/site-packages/sklearn/pipeline.py", line 265, in fit
    Xt, fit_params = self._fit(X, y, **fit_params)
  File "/home/janvanrijn/anaconda3/envs/openml-python/lib/python3.6/site-packages/sklearn/pipeline.py", line 230, in _fit
    **fit_params_steps[name])
  File "/home/janvanrijn/anaconda3/envs/openml-python/lib/python3.6/site-packages/sklearn/externals/joblib/memory.py", line 329, in __call__
    return self.func(*args, **kwargs)
  File "/home/janvanrijn/anaconda3/envs/openml-python/lib/python3.6/site-packages/sklearn/pipeline.py", line 614, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "/home/janvanrijn/anaconda3/envs/openml-python/lib/python3.6/site-packages/sklearn/base.py", line 465, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)
  File "/home/janvanrijn/anaconda3/envs/openml-python/lib/python3.6/site-packages/sklearn/feature_selection/variance_threshold.py", line 64, in fit
    X = check_array(X, ('csr', 'csc'), dtype=np.float64)
  File "/home/janvanrijn/anaconda3/envs/openml-python/lib/python3.6/site-packages/sklearn/utils/validation.py", line 568, in check_array
    allow_nan=force_all_finite == 'allow-nan')
  File "/home/janvanrijn/anaconda3/envs/openml-python/lib/python3.6/site-packages/sklearn/utils/validation.py", line 56, in _assert_all_finite
    raise ValueError(msg_err.format(type_err, X.dtype))
ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

I suspect this is because the columns have not gone through the column transformer's pipelines and the 'passthrough' mechanism has been activated for them instead.

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions