提问人:jdbanfill 提问时间:11/15/2023 最后编辑:jdbanfill 更新时间:11/16/2023 访问量:40
为什么我的预处理器会出现数据类型错误?
Why am I getting a data type error from my preprocessor?
问:
我在为我的数据创建预处理器时遇到问题。我的预处理器由一个用于插补 NaN 和缩放值的数值特征管道组成。它还具有用于插补 NaN 和目标编码的分类数据管道。最终的转换器是一个选择器,它保留了满足特定条件的特征。当我将预处理器安装到数据时,出现以下错误:
X_40 = preprocessor_40.fit_transform(X, y)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
/tmp/ipykernel_33/4000599259.py in ?()
----> 1 X_40 = preprocessor_40.fit_transform(X, y)
/opt/conda/lib/python3.10/site-packages/sklearn/utils/_set_output.py in ?(self, X, *args, **kwargs)
138 @wraps(f)
139 def wrapped(self, X, *args, **kwargs):
--> 140 data_to_wrap = f(self, X, *args, **kwargs)
141 if isinstance(data_to_wrap, tuple):
142 # only wrap the first output for cross decomposition
143 return (
/opt/conda/lib/python3.10/site-packages/sklearn/compose/_column_transformer.py in ?(self, X, y)
723 self._validate_transformers()
724 self._validate_column_callables(X)
725 self._validate_remainder(X)
726
--> 727 result = self._fit_transform(X, y, _fit_transform_one)
728
729 if not result:
730 self._update_fitted_transformers([])
/opt/conda/lib/python3.10/site-packages/sklearn/compose/_column_transformer.py in ?(self, X, y, func, fitted, column_as_strings)
669 except ValueError as e:
670 if "Expected 2D array, got 1D array instead" in str(e):
671 raise ValueError(_ERR_MSG_1DCOLUMN) from e
672 else:
--> 673 raise
/opt/conda/lib/python3.10/site-packages/sklearn/utils/parallel.py in ?(self, iterable)
59 iterable_with_config = (
60 (_with_config(delayed_func, config), args, kwargs)
61 for delayed_func, args, kwargs in iterable
62 )
---> 63 return super().__call__(iterable_with_config)
/opt/conda/lib/python3.10/site-packages/joblib/parallel.py in ?(self, iterable)
1859 # If n_jobs==1, run the computation sequentially and return
1860 # immediatly to avoid overheads.
1861 output = self._get_sequential_output(iterable)
1862 next(output)
-> 1863 return output if self.return_generator else list(output)
1864
1865 # Let's create an ID that uniquely identifies the current call. If the
1866 # call is interrupted early and that the same instance is immediately
/opt/conda/lib/python3.10/site-packages/joblib/parallel.py in ?(self, iterable)
1802 finally:
1803 self.print_progress()
1804 self._running = False
1805 self._iterating = False
-> 1806 self._original_iterator = None
/opt/conda/lib/python3.10/site-packages/sklearn/utils/parallel.py in ?(self, *args, **kwargs)
119 UserWarning,
120 )
121 config = {}
122 with config_context(**config):
--> 123 return self.function(*args, **kwargs)
/opt/conda/lib/python3.10/site-packages/sklearn/pipeline.py in ?(transformer, X, y, weight, message_clsname, message, **fit_params)
889 be multiplied by ``weight``.
890 """
891 with _print_elapsed_time(message_clsname, message):
892 if hasattr(transformer, "fit_transform"):
--> 893 res = transformer.fit_transform(X, y, **fit_params)
894 else:
895 res = transformer.fit(X, y, **fit_params).transform(X)
896
/opt/conda/lib/python3.10/site-packages/sklearn/utils/_set_output.py in ?(self, X, *args, **kwargs)
138 @wraps(f)
139 def wrapped(self, X, *args, **kwargs):
--> 140 data_to_wrap = f(self, X, *args, **kwargs)
141 if isinstance(data_to_wrap, tuple):
142 # only wrap the first output for cross decomposition
143 return (
/opt/conda/lib/python3.10/site-packages/sklearn/base.py in ?(self, X, y, **fit_params)
877 # fit method of arity 1 (unsupervised transformation)
878 return self.fit(X, **fit_params).transform(X)
879 else:
880 # fit method of arity 2 (supervised transformation)
--> 881 return self.fit(X, y, **fit_params).transform(X)
/opt/conda/lib/python3.10/site-packages/sklearn/feature_selection/_univariate_selection.py in ?(self, X, y)
463 Returns the instance itself.
464 """
465 self._validate_params()
466
--> 467 X, y = self._validate_data(
468 X, y, accept_sparse=["csr", "csc"], multi_output=True
469 )
470
/opt/conda/lib/python3.10/site-packages/sklearn/base.py in ?(self, X, y, reset, validate_separately, **check_params)
580 if "estimator" not in check_y_params:
581 check_y_params = {**default_check_params, **check_y_params}
582 y = check_array(y, input_name="y", **check_y_params)
583 else:
--> 584 X, y = check_X_y(X, y, **check_params)
585 out = X, y
586
587 if not no_val_X and check_params.get("ensure_2d", True):
/opt/conda/lib/python3.10/site-packages/sklearn/utils/validation.py in ?(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)
1102 raise ValueError(
1103 f"{estimator_name} requires y to be passed, but the target y is None"
1104 )
1105
-> 1106 X = check_array(
1107 X,
1108 accept_sparse=accept_sparse,
1109 accept_large_sparse=accept_large_sparse,
/opt/conda/lib/python3.10/site-packages/sklearn/utils/validation.py in ?(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name)
876 )
877 array = xp.astype(array, dtype, copy=False)
878 else:
879 array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
--> 880 except ComplexWarning as complex_warning:
881 raise ValueError(
882 "Complex data not supported\n{}\n".format(array)
883 ) from complex_warning
/opt/conda/lib/python3.10/site-packages/sklearn/utils/_array_api.py in ?(array, dtype, order, copy, xp)
181 if xp is None:
182 xp, _ = get_namespace(array)
183 if xp.__name__ in {"numpy", "numpy.array_api"}:
184 # Use NumPy API to support order
--> 185 array = numpy.asarray(array, order=order, dtype=dtype)
186 return xp.asarray(array, copy=copy)
187 else:
188 return xp.asarray(array, dtype=dtype, copy=copy)
/opt/conda/lib/python3.10/site-packages/pandas/core/generic.py in ?(self, dtype)
1996 def __array__(self, dtype: npt.DTypeLike | None = None) -> np.ndarray:
1997 values = self._values
-> 1998 arr = np.asarray(values, dtype=dtype)
1999 if (
2000 astype_is_view(values.dtype, arr.dtype)
2001 and using_copy_on_write()
ValueError: could not convert string to float: 'RL'
我怀疑这是我的列转换器而不是选择器的问题,因为我一直遇到问题,这是我正在尝试的一种选择功能的新方法。但是,我不确定。以下代码是我定义预处理器的方式:
def get_preprocessor(percent, categorical, numerical):
#Pipeline to impute missing values and scale numerical variables
numerical_processes = Pipeline(steps = [('imputer_num', SimpleImputer(strategy = 'constant', fill_value = 0)),
('scaler', StandardScaler())])
#Pipeline to impute missing values and encode categorical variables
categorical_processes = Pipeline(steps = [('imputer_cat', SimpleImputer(strategy = 'constant', fill_value = 'None')),
('encoder', ce.TargetEncoder())])
#Selector to retain only features that meet a certain threshold in an f_regression
selector = SelectPercentile(f_regression, percentile = percent)
#create a preprocessor that wraps up processes for both numerical and categorical variables
Preprocessor = ColumnTransformer(
transformers = [('numeric', numerical_processes, numerical),
('categorical', categorical_processes, categorical),
('selector', selector, numerical + categorical)])
return Preprocessor
preprocessor_full = get_preprocessor(0, categorical, numerical)
preprocessor_40 = get_preprocessor(40, categorical, numerical)
preprocessor_70 = get_preprocessor(70, categorical, numerical)
我已经多次检查了如何定义分类列和数值列,但没有发现任何问题。当它在错误中引用 RL 时,该错误是名为 MSZoning 的列中的值之一,并且以分类形式列出,但不是数字形式。所以我不知道为什么我会收到这个错误,如果它在正确的地方。这也是我的专栏索引中的第一个分类特征,以供参考。
我尝试指定两个不同的选择器在我的管道中工作,而不是将其作为列转换器中的一个步骤,我得到了同样的错误。但是,当我在数字和分类管道中都有选择器时,我用它们来拟合和转换数据帧,而不是将其全部组合到列转换器中,并且它起作用了。我不知道为什么列变压器给我带来问题。我知道你可能会说我应该没有它,但我只想让它工作。
答:
1赞
Ben Reiniger
11/16/2023
#1
ColumnTransformer
并联应用其变压器,水平堆叠输出(不按顺序,就位替换列)。例如,参见 Consistent ColumnTransformer 以获取相交的列列表
因此,您的特征选择转换器获取的是原始分类列,而不是目标编码的列,并且对字符串值犹豫不决。
相反,您应该使用 其第一步是列转换器,其第二步是特征选择。Pipeline
评论