-
-
Notifications
You must be signed in to change notification settings - Fork 211
Closed
Description
Description
Re-instantiating a model that contains a ColumnTransformer does not restore the transformer's column indices.
Steps/Code to Reproduce
import logging

import numpy as np
import openml
import sklearn.compose
import sklearn.ensemble
import sklearn.feature_selection
import sklearn.impute
import sklearn.pipeline
import sklearn.preprocessing
# Module-level task: read by get_model() and passed to both run_model_on_task calls.
task = openml.tasks.get_task(6) # use task 2 on live, or task 6 on test
def get_model():
    """Build the scikit-learn pipeline used to reproduce the issue.

    The pipeline has three steps:

    1. A ``ColumnTransformer`` that sends the numeric columns through an
       imputer followed by a ``StandardScaler``, sends the nominal columns
       through a constant-fill imputer (``fill_value=-1``) followed by a
       ``OneHotEncoder(handle_unknown='ignore')``, and passes every other
       column through unchanged (``remainder='passthrough'``).
    2. ``VarianceThreshold`` feature selection.
    3. A ``RandomForestClassifier``.

    The column indices come from the dataset of the module-level ``task``;
    the target name is handed to ``get_features_by_type`` as its second
    argument (presumably to exclude the target column -- confirm against
    the openml API).

    Returns
    -------
    The assembled, unfitted pipeline.
    """
    # NOTE(review): ``sklearn.preprocessing.Imputer`` is the older imputer API
    # (the nominal branch already uses ``sklearn.impute.SimpleImputer``); it is
    # kept here because it is part of the reported reproduction.
    numeric_transformer = sklearn.pipeline.make_pipeline(
        sklearn.preprocessing.Imputer(),
        sklearn.preprocessing.StandardScaler())
    categorical_transformer = sklearn.pipeline.make_pipeline(
        sklearn.impute.SimpleImputer(strategy='constant', fill_value=-1),
        sklearn.preprocessing.OneHotEncoder(handle_unknown='ignore'))

    # These index lists are the ColumnTransformer's column specification.
    nominal_indices = task.get_dataset().get_features_by_type('nominal', [task.target_name])
    numeric_indices = task.get_dataset().get_features_by_type('numeric', [task.target_name])

    transformer = sklearn.compose.ColumnTransformer(
        transformers=[
            ('numeric', numeric_transformer, numeric_indices),
            ('nominal', categorical_transformer, nominal_indices)],
        remainder='passthrough')
    clf = sklearn.pipeline.make_pipeline(transformer,
                                         sklearn.feature_selection.VarianceThreshold(),
                                         sklearn.ensemble.RandomForestClassifier())
    return clf
# First run: build the model locally and run it on the task
# (avoid_duplicate_runs=False is passed explicitly).
model_original = get_model()
run = openml.runs.run_model_on_task(
model_original,
task,
avoid_duplicate_runs=False)
run_original = run.publish() # this implicitly uploads the flow
# Fetch the published run back by its id and read the setup id from it.
run_downloaded = openml.runs.get_run(run_original.run_id)
setup_id = run_downloaded.setup_id
# Re-instantiate the model from the setup id alone.
model_duplicate = openml.setups.initialize_model(setup_id)
# Second run, using the re-instantiated model; the traceback under
# "Actual Results" is raised from this call.
run_duplicate = openml.runs.run_model_on_task(
model_duplicate, task, avoid_duplicate_runs=False)
Expected Results
The run is executed twice: once with the original model and once with the model re-instantiated from the setup.
Actual Results
Traceback (most recent call last):
File "/home/janvanrijn/projects/openml-python/examples/run_setup_tutorial.py", line 102, in <module>
model_duplicate, task, avoid_duplicate_runs=False)
File "/home/janvanrijn/projects/openml-python/openml/runs/functions.py", line 48, in run_model_on_task
add_local_measures=add_local_measures)
File "/home/janvanrijn/projects/openml-python/openml/runs/functions.py", line 119, in run_flow_on_task
res = _run_task_get_arffcontent(flow.model, task, add_local_measures=add_local_measures)
File "/home/janvanrijn/projects/openml-python/openml/runs/functions.py", line 458, in _run_task_get_arffcontent
add_local_measures=add_local_measures)
File "/home/janvanrijn/projects/openml-python/openml/runs/functions.py", line 572, in _run_model_on_fold
model.fit(trainX, trainY)
File "/home/janvanrijn/anaconda3/envs/openml-python/lib/python3.6/site-packages/sklearn/pipeline.py", line 265, in fit
Xt, fit_params = self._fit(X, y, **fit_params)
File "/home/janvanrijn/anaconda3/envs/openml-python/lib/python3.6/site-packages/sklearn/pipeline.py", line 230, in _fit
**fit_params_steps[name])
File "/home/janvanrijn/anaconda3/envs/openml-python/lib/python3.6/site-packages/sklearn/externals/joblib/memory.py", line 329, in __call__
return self.func(*args, **kwargs)
File "/home/janvanrijn/anaconda3/envs/openml-python/lib/python3.6/site-packages/sklearn/pipeline.py", line 614, in _fit_transform_one
res = transformer.fit_transform(X, y, **fit_params)
File "/home/janvanrijn/anaconda3/envs/openml-python/lib/python3.6/site-packages/sklearn/base.py", line 465, in fit_transform
return self.fit(X, y, **fit_params).transform(X)
File "/home/janvanrijn/anaconda3/envs/openml-python/lib/python3.6/site-packages/sklearn/feature_selection/variance_threshold.py", line 64, in fit
X = check_array(X, ('csr', 'csc'), dtype=np.float64)
File "/home/janvanrijn/anaconda3/envs/openml-python/lib/python3.6/site-packages/sklearn/utils/validation.py", line 568, in check_array
allow_nan=force_all_finite == 'allow-nan')
File "/home/janvanrijn/anaconda3/envs/openml-python/lib/python3.6/site-packages/sklearn/utils/validation.py", line 56, in _assert_all_finite
raise ValueError(msg_err.format(type_err, X.dtype))
ValueError: Input contains NaN, infinity or a value too large for dtype('float64').
I suspect this happens because the columns have not gone through the column transformer and the 'passthrough' mechanism has been activated instead.
Metadata
Metadata
Assignees
Labels
No labels