提问人:PRADDYUMN YADAV 提问时间:11/11/2023 最后编辑:Ian ThompsonPRADDYUMN YADAV 更新时间:11/18/2023 访问量:89
AttributeError:'numpy.ndarray' 对象没有属性 'columns' —— 但我根本没有把 DataFrame 转换成 np.ndarray
AttributeError: 'numpy.ndarray' object has no attribute 'columns' -> when I didn't even change dataframe into np.ndarry
问:
目前,我正在尝试解决 Kaggle 的 Spaceship Titanic(宇宙飞船泰坦尼克号)数据集,在这个过程中遇到了一个错误(见下文)。在下面的代码中,`X` 是同一个数据集移除 "Transformed Feature" 之后的结果。
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
class Cleaner(BaseEstimator, TransformerMixin):
    """Pipeline step that removes a fixed set of columns from ``X``.

    Assumes ``X`` is a pandas DataFrame containing every listed column.
    """

    def fit(self, X, y=None):
        """Learn nothing; return the instance so calls can be chained."""
        return self

    def transform(self, X):
        """Return ``X`` without the columns listed below."""
        unused_columns = ["Name", "PassengerId", "FoodCourt", "ShoppingMall", "Cabin"]
        return X.drop(unused_columns, axis=1)
class ObjectEncoder(BaseEstimator, TransformerMixin):
    """Pipeline step that one-hot encodes four categorical columns.

    Each source column is replaced by indicator columns named
    ``<column>_<i>``, where ``i`` is the position of the indicator in the
    encoder output.

    NOTE(review): the encoder is re-fitted inside ``transform`` on whatever
    data is passed in, and the indicator columns are assigned onto the ``X``
    object received before the source columns are dropped.
    """

    def fit(self, X, y=None):
        """Learn nothing; return the instance so calls can be chained."""
        return self

    def transform(self, X):
        """Add the indicator columns to ``X`` and return it without the sources."""
        categorical = ["HomePlanet", "CryoSleep", "Destination", "VIP"]
        one_hot = OneHotEncoder()
        for source in categorical:
            # `.toarray()` turns the encoder output into a dense 2-D array.
            dense = one_hot.fit_transform(X[[source]]).toarray()
            # Transposing yields one row per indicator column.
            for position, indicator in enumerate(dense.T):
                X[f"{source}_{position}"] = indicator
        # The original categorical columns are no longer needed.
        return X.drop(categorical, axis=1)
class LabelYEncoder(BaseEstimator, TransformerMixin):
    """Pipeline step that label-encodes a target called ``y``.

    NOTE(review): ``transform`` returns the encoded labels in place of ``X``,
    so the next pipeline step receives that value rather than the feature
    table it was given. The AttributeError quoted in the question reports a
    ``numpy.ndarray`` reaching ``NullImputer.transform``.
    """

    def fit(self, X, y=None):
        # Nothing is learned during fit.
        return self

    def transform(self, X):
        label_encoder = LabelEncoder()
        # NOTE(review): `y` is neither a parameter nor a local of this method,
        # so it is looked up in the enclosing/global scope; `X` is never used.
        y_encoded = label_encoder.fit_transform(y)
        return y_encoded
class NullImputer(BaseEstimator, TransformerMixin):
    """Pipeline step that fills missing values with the column median.

    ``transform`` reads ``X.columns`` and ``X.isnull()``, so it needs a
    DataFrame-like input; the traceback in the question shows it failing on
    its first line when it is handed a ``numpy.ndarray``.
    """

    def fit(self, X, y=None):
        # Nothing is learned during fit; the imputer is created in transform.
        return self

    def transform(self, X):
        # Names of the columns that contain at least one null value.
        features_with_missing_values = X.columns[X.isnull().any()].tolist()
        imputer = SimpleImputer(strategy='median')
        # NOTE(review): the imputer is fitted on the data being transformed,
        # and the result is written back into the `X` object that was passed in.
        X[features_with_missing_values] = imputer.fit_transform(X[features_with_missing_values])
        return X
# Chain the four custom steps in order; each step's output is the next step's input.
pipeline = Pipeline([
    ("cleaner", Cleaner()),
    ("object_encoder", ObjectEncoder()),
    # NOTE(review): this step's transform returns encoded labels, not the feature table.
    ("label_y_encoder", LabelYEncoder()),
    ("null_imputer", NullImputer()),
])
# `strat_train_set` is defined outside this snippet.
X = pipeline.fit_transform(strat_train_set)
此代码产生错误:-
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
Cell In[21], line 1
----> 1 X = pipeline.fit_transform(strat_train_set)
File ~/Projects/Cracking-Spaceship-Titanic/venv/lib/python3.10/site-packages/sklearn/base.py:1152, in _fit_context.<locals>.decorator.<locals>.wrapper(estimator, *args, **kwargs)
1145 estimator._validate_params()
1147 with config_context(
1148 skip_parameter_validation=(
1149 prefer_skip_nested_validation or global_skip_validation
1150 )
1151 ):
-> 1152 return fit_method(estimator, *args, **kwargs)
File ~/Projects/Cracking-Spaceship-Titanic/venv/lib/python3.10/site-packages/sklearn/pipeline.py:479, in Pipeline.fit_transform(self, X, y, **fit_params)
477 fit_params_last_step = fit_params_steps[self.steps[-1][0]]
478 if hasattr(last_step, "fit_transform"):
--> 479 return last_step.fit_transform(Xt, y, **fit_params_last_step)
480 else:
481 return last_step.fit(Xt, y, **fit_params_last_step).transform(Xt)
File ~/Projects/Cracking-Spaceship-Titanic/venv/lib/python3.10/site-packages/sklearn/utils/_set_output.py:157, in _wrap_method_output.<locals>.wrapped(self, X, *args, **kwargs)
155 @wraps(f)
156 def wrapped(self, X, *args, **kwargs):
--> 157 data_to_wrap = f(self, X, *args, **kwargs)
158 if isinstance(data_to_wrap, tuple):
159 # only wrap the first output for cross decomposition
160 return_tuple = (
161 _wrap_data_with_container(method, data_to_wrap[0], X, self),
162 *data_to_wrap[1:],
163 )
File ~/Projects/Cracking-Spaceship-Titanic/venv/lib/python3.10/site-packages/sklearn/base.py:916, in TransformerMixin.fit_transform(self, X, y, **fit_params)
912 # non-optimized default implementation; override when a better
913 # method is possible for a given clustering algorithm
914 if y is None:
915 # fit method of arity 1 (unsupervised transformation)
--> 916 return self.fit(X, **fit_params).transform(X)
917 else:
918 # fit method of arity 2 (supervised transformation)
919 return self.fit(X, y, **fit_params).transform(X)
File ~/Projects/Cracking-Spaceship-Titanic/venv/lib/python3.10/site-packages/sklearn/utils/_set_output.py:157, in _wrap_method_output.<locals>.wrapped(self, X, *args, **kwargs)
155 @wraps(f)
156 def wrapped(self, X, *args, **kwargs):
--> 157 data_to_wrap = f(self, X, *args, **kwargs)
158 if isinstance(data_to_wrap, tuple):
159 # only wrap the first output for cross decomposition
160 return_tuple = (
161 _wrap_data_with_container(method, data_to_wrap[0], X, self),
162 *data_to_wrap[1:],
163 )
Cell In[19], line 7, in NullImputer.transform(self, X)
6 def transform(self, X):
----> 7 features_with_missing_values = X.columns[X.isnull().any()].tolist()
8 imputer = SimpleImputer(strategy='median')
9 X[features_with_missing_values] = imputer.fit_transform(X[features_with_missing_values])
AttributeError: 'numpy.ndarray' object has no attribute 'columns'
答:
来自 sklearn-template 文档:
请注意,`transform` 仅将 `X` 作为输入,并应返回 `X` 转换后的版本。
所以现在,在你的 `LabelYEncoder` 类的 `transform` 方法中,数据帧 `X` 被替换成了 `y_encoded`,而它是一个 `'numpy.ndarray'`。
看起来您正在尝试为预处理管道构建一些自定义类。我不知道您使用的是哪个版本的 `scikit-learn`,但 `1.2.X` 之前的任何版本都会尝试将输入转换为 `np.ndarray`;`.set_output` 方法改变了这一行为。
scikit-learn 1.3.2
如果您能够升级,则您引发的错误将更改为:
class Cleaner(BaseEstimator, TransformerMixin):
    """Drop the columns that are not used as features.

    Assumes ``X`` is a pandas DataFrame containing every listed column.
    """

    def fit(self, X, y=None):
        """Stateless step: return the instance unchanged."""
        return self

    def transform(self, X):
        """Return ``X`` with the unused columns removed."""
        to_remove = ["Name", "PassengerId", "FoodCourt", "ShoppingMall", "Cabin"]
        trimmed = X.drop(to_remove, axis=1)
        return trimmed
class ObjectEncoder(BaseEstimator, TransformerMixin):
    """One-hot encode ``HomePlanet``, ``CryoSleep``, ``Destination`` and ``VIP``.

    Indicator columns are named ``<column>_<i>`` and are assigned onto the
    ``X`` object received; the four source columns are then dropped.

    NOTE(review): the encoder is fitted inside ``transform``, so the set of
    indicator columns depends on the data passed to each call.
    """

    def fit(self, X, y=None):
        """Stateless step: return the instance unchanged."""
        return self

    def transform(self, X):
        """Return ``X`` with indicator columns in place of the source columns."""
        targets = ["HomePlanet", "CryoSleep", "Destination", "VIP"]
        encoder = OneHotEncoder()
        for name in targets:
            encoded = encoder.fit_transform(X[[name]])
            # One row per indicator once the dense array is transposed.
            indicator_rows = encoded.toarray().T
            for idx, row in enumerate(indicator_rows):
                X[f"{name}_{idx}"] = row
        # Remove the columns that have just been encoded.
        X = X.drop(targets, axis=1)
        return X
class LabelYEncoder(BaseEstimator, TransformerMixin):
    """Pipeline step that label-encodes a target called ``y``.

    NOTE(review): ``transform`` has no ``y`` parameter and defines no local
    ``y``; when no outer ``y`` exists the call fails with
    ``NameError: name 'y' is not defined``, as the traceback for this cell
    shows. The method also returns the encoded labels instead of ``X``.
    """

    def fit(self, X, y=None):
        # Nothing is learned during fit.
        return self

    def transform(self, X):
        label_encoder = LabelEncoder()
        # `y` is resolved from the enclosing/global scope; `X` is never used.
        y_encoded = label_encoder.fit_transform(y)
        return y_encoded
class NullImputer(BaseEstimator, TransformerMixin):
    """Fill missing values with the column median.

    ``transform`` reads ``X.columns`` and ``X.isnull()``, so it needs a
    DataFrame-like input.

    NOTE(review): the imputer is fitted inside ``transform`` on the data
    being transformed, and the filled values are written back into the ``X``
    object that was passed in.
    """

    def fit(self, X, y=None):
        """Stateless step: return the instance unchanged."""
        return self

    def transform(self, X):
        """Impute every column that contains at least one null and return ``X``."""
        has_null = X.isnull().any()
        incomplete = X.columns[has_null].tolist()
        median_filler = SimpleImputer(strategy="median")
        filled = median_filler.fit_transform(X[incomplete])
        X[incomplete] = filled
        return X
# Chain the four custom steps in order; each step's output is the next step's input.
pipeline = Pipeline(
    steps=[
        ("cleaner", Cleaner()),
        ("object_encoder", ObjectEncoder()),
        ("label_y_encoder", LabelYEncoder()),
        ("null_imputer", NullImputer()),
    ]
)
df = pd.read_csv("train.csv")
# "In the code below X is the same dataset after removing the Transformed Feature from it."
# Idk what "Transformed Feature" is, but I assume you mean "Transported"
strat_train_set = df.drop(columns="Transported")
# No `y` is defined in this cell, so this call ends in the NameError raised
# from `LabelYEncoder.transform` (see the traceback that follows).
X = pipeline.fit_transform(strat_train_set)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[21], line 61
57 # "In the code below X is the same dataset after removing the Transformed Feature from it."
58 # Idk what "Transformed Feature" is, but I assume you mean "Transported"
59 strat_train_set = df.drop(columns="Transported")
---> 61 X = pipeline.fit_transform(strat_train_set)
File ~\Sandbox\PRIVATE-RAI\env\Lib\site-packages\sklearn\base.py:1152, in _fit_context.<locals>.decorator.<locals>.wrapper(estimator, *args, **kwargs)
1145 estimator._validate_params()
1147 with config_context(
1148 skip_parameter_validation=(
1149 prefer_skip_nested_validation or global_skip_validation
1150 )
1151 ):
-> 1152 return fit_method(estimator, *args, **kwargs)
File ~\Sandbox\PRIVATE-RAI\env\Lib\site-packages\sklearn\pipeline.py:471, in Pipeline.fit_transform(self, X, y, **fit_params)
444 """Fit the model and transform with the final estimator.
445
446 Fits all the transformers one after the other and transform the
(...)
468 Transformed samples.
469 """
470 fit_params_steps = self._check_fit_params(**fit_params)
--> 471 Xt = self._fit(X, y, **fit_params_steps)
473 last_step = self._final_estimator
474 with _print_elapsed_time("Pipeline", self._log_message(len(self.steps) - 1)):
File ~\Sandbox\PRIVATE-RAI\env\Lib\site-packages\sklearn\pipeline.py:377, in Pipeline._fit(self, X, y, **fit_params_steps)
375 cloned_transformer = clone(transformer)
376 # Fit or load from cache the current transformer
--> 377 X, fitted_transformer = fit_transform_one_cached(
378 cloned_transformer,
379 X,
380 y,
381 None,
382 message_clsname="Pipeline",
383 message=self._log_message(step_idx),
384 **fit_params_steps[name],
385 )
386 # Replace the transformer of the step with the fitted
387 # transformer. This is necessary when loading the transformer
388 # from the cache.
389 self.steps[step_idx] = (name, fitted_transformer)
File ~\Sandbox\PRIVATE-RAI\env\Lib\site-packages\joblib\memory.py:353, in NotMemorizedFunc.__call__(self, *args, **kwargs)
352 def __call__(self, *args, **kwargs):
--> 353 return self.func(*args, **kwargs)
File ~\Sandbox\PRIVATE-RAI\env\Lib\site-packages\sklearn\pipeline.py:957, in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
955 with _print_elapsed_time(message_clsname, message):
956 if hasattr(transformer, "fit_transform"):
--> 957 res = transformer.fit_transform(X, y, **fit_params)
958 else:
959 res = transformer.fit(X, y, **fit_params).transform(X)
File ~\Sandbox\PRIVATE-RAI\env\Lib\site-packages\sklearn\utils\_set_output.py:157, in _wrap_method_output.<locals>.wrapped(self, X, *args, **kwargs)
155 @wraps(f)
156 def wrapped(self, X, *args, **kwargs):
--> 157 data_to_wrap = f(self, X, *args, **kwargs)
158 if isinstance(data_to_wrap, tuple):
159 # only wrap the first output for cross decomposition
160 return_tuple = (
161 _wrap_data_with_container(method, data_to_wrap[0], X, self),
162 *data_to_wrap[1:],
163 )
File ~\Sandbox\PRIVATE-RAI\env\Lib\site-packages\sklearn\base.py:916, in TransformerMixin.fit_transform(self, X, y, **fit_params)
912 # non-optimized default implementation; override when a better
913 # method is possible for a given clustering algorithm
914 if y is None:
915 # fit method of arity 1 (unsupervised transformation)
--> 916 return self.fit(X, **fit_params).transform(X)
917 else:
918 # fit method of arity 2 (supervised transformation)
919 return self.fit(X, y, **fit_params).transform(X)
File ~\Sandbox\PRIVATE-RAI\env\Lib\site-packages\sklearn\utils\_set_output.py:157, in _wrap_method_output.<locals>.wrapped(self, X, *args, **kwargs)
155 @wraps(f)
156 def wrapped(self, X, *args, **kwargs):
--> 157 data_to_wrap = f(self, X, *args, **kwargs)
158 if isinstance(data_to_wrap, tuple):
159 # only wrap the first output for cross decomposition
160 return_tuple = (
161 _wrap_data_with_container(method, data_to_wrap[0], X, self),
162 *data_to_wrap[1:],
163 )
Cell In[21], line 33, in LabelYEncoder.transform(self, X)
31 def transform(self, X):
32 label_encoder = LabelEncoder()
---> 33 y_encoded = label_encoder.fit_transform(y)
34 return y_encoded
NameError: name 'y' is not defined
您可以通过修改 `LabelYEncoder.transform` 方法,使其接受 `y` 参数来解决这个问题。但就我个人而言,我会修改您的自定义类以使其更通用,并使用 `ColumnTransformer` 来控制每个步骤的输入,如下所示:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, LabelEncoder
# Separating `X` and `y`
df = pd.read_csv("train.csv")
X = df.drop(columns="Transported")
y = df.Transported
cols_to_drop = ["Name", "PassengerId", "FoodCourt", "ShoppingMall", "Cabin"]
cols_to_one_hot = ["HomePlanet", "CryoSleep", "Destination", "VIP"]
# Remove the `cols_to_drop` and `cols_to_one_hot`.
# Whatever is left is sent to the median imputer below.
cols_to_keep = X.drop(columns=[*cols_to_drop, *cols_to_one_hot]).columns
prep_trf = ColumnTransformer(
    transformers=[
        # One hot encode the `cols_to_one_hot`
        ("one_hot", OneHotEncoder(sparse_output=False), cols_to_one_hot),
        # Impute the `cols_to_keep`
        ("imputer", SimpleImputer(strategy="median"), cols_to_keep),
    ],
    # Drop the remaining columns not mentioned in
    # `cols_to_one_hot` or `cols_to_keep`
    remainder="drop",
    # Do not prefix the output column names with the transformer name.
    verbose_feature_names_out=False,
    # Transform output to `pandas.DataFrame` (optional)
).set_output(transform="pandas")
# Apply the transformations to `X`.
# NOTE: this rebinds `X` to the transformed output.
X = prep_trf.fit_transform(X)
调用 `prep_trf.fit_transform` 将处理输入要素 `X` 的所有预处理。因为 `LabelEncoder` 只接受 `y` 作为输入,并且不支持 `set_output` 方法,所以你不能把它添加到 `prep_trf` 对象中。事实上,从技术上讲,如果没有一些自定义代码,就无法将其添加到管道中。相反,您应该独立转换 `y` 变量,或者完全忽略它(因为它已经是布尔值形式的二元变量)。
scikit-learn 1.1.3 (< 1.2.X)
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, LabelEncoder
# Separating `X` and `y`
df = pd.read_csv("train.csv")
X = df.drop(columns="Transported")
y = df.Transported
cols_to_drop = ["Name", "PassengerId", "FoodCourt", "ShoppingMall", "Cabin"]
cols_to_one_hot = ["HomePlanet", "CryoSleep", "Destination", "VIP"]
# Remove the `cols_to_drop` and `cols_to_one_hot`.
# Whatever is left is sent to the median imputer below.
cols_to_keep = X.drop(columns=[*cols_to_drop, *cols_to_one_hot]).columns
prep_trf = ColumnTransformer(
    transformers=[
        # One hot encode the `cols_to_one_hot`
        # NOTE: had to change the `sparse_output` parameter to `sparse`.
        ("one_hot", OneHotEncoder(sparse=False), cols_to_one_hot),
        # Impute the `cols_to_keep`
        ("imputer", SimpleImputer(strategy="median"), cols_to_keep),
    ],
    # Drop the remaining columns not mentioned in
    # `cols_to_one_hot` or `cols_to_keep`
    remainder="drop",
    # Do not prefix the output column names with the transformer name.
    verbose_feature_names_out=False,
    # NOTE: :method:`set_output` does not exist in scikit-learn version < 1.2.X
)
# Apply the transformations to `X`.
# The result is kept in `X_` so the original `X` stays available.
X_ = prep_trf.fit_transform(X)
因为 `scikit-learn` 1.1.3 版本没有 `set_output` 方法,我们需要自己重新构建 `pandas.DataFrame`:
# Rebuild a labelled DataFrame from the transformer output `X_`.
X_v1_1_3 = pd.DataFrame(
    data=X_,
    # Track the index just in case it was modified earlier.
    # This can affect `joins`.
    index=X.index,
    # Use :method:`prep_trf.get_feature_names_out` to get the
    # correct column names.
    columns=prep_trf.get_feature_names_out(),
)
在这两种情况下,输出都是相同的。在两个版本中,都需要把 `LabelEncoder` 单独应用到 `y` 上。
其他资源:
评论
sklearn