对Series或者单列使用one-hot编码

本文探讨了在二分类任务中使用one-hot编码代替传统的sigmoid阈值划分方法。作者分享了使用pandas的get_dummies()函数进行one-hot编码的具体操作,并展示了转换后数据的格式。

需求:我有一列取值为0/1的label,之前是使用tensorflow直接预测,将输出值经过sigmoid之后按照0.5的阈值划分类别,但是效果一般。所以想使用one-hot试试,虽然我也听说二分类不需要one-hot。

操作:直接使用pandas自带的get_dummies()函数。

df_label = pd.get_dummies(df_label)

输出:df_label变成一个若干行、两列的DataFrame(每个类别对应一列)。

--------------------------------------------------------------------------- ValueError Traceback (most recent call last) Cell In[17], line 89 87 # 使用SMOTE处理不均衡 88 smote = SMOTE(random_state=42) ---> 89 X_res, y_res = smote.fit_resample(X_train, y_train_encoded) 90 print("\n应用SMOTE后的类别分布:", pd.Series(y_res).value_counts()) 92 # 6. 特征编码(分类特征转换为数值) File D:\Anaconda\Lib\site-packages\imblearn\base.py:202, in BaseSampler.fit_resample(self, X, y, **params) 181 def fit_resample(self, X, y, **params): 182 """Resample the dataset. 183 184 Parameters (...) 200 The corresponding label of `X_resampled`. 201 """ --> 202 return super().fit_resample(X, y, **params) File D:\Anaconda\Lib\site-packages\sklearn\base.py:1473, in _fit_context.<locals>.decorator.<locals>.wrapper(estimator, *args, **kwargs) 1466 estimator._validate_params() 1468 with config_context( 1469 skip_parameter_validation=( 1470 prefer_skip_nested_validation or global_skip_validation 1471 ) 1472 ): -> 1473 return fit_method(estimator, *args, **kwargs) File D:\Anaconda\Lib\site-packages\imblearn\base.py:99, in SamplerMixin.fit_resample(self, X, y, **params) 97 check_classification_targets(y) 98 arrays_transformer = ArraysTransformer(X, y) ---> 99 X, y, binarize_y = self._check_X_y(X, y) 101 self.sampling_strategy_ = check_sampling_strategy( 102 self.sampling_strategy, y, self._sampling_type 103 ) 105 output = self._fit_resample(X, y, **params) File D:\Anaconda\Lib\site-packages\imblearn\base.py:157, in BaseSampler._check_X_y(self, X, y, accept_sparse) 155 accept_sparse = ["csr", "csc"] 156 y, binarize_y = check_target_type(y, indicate_one_vs_all=True) --> 157 X, y = validate_data(self, X=X, y=y, reset=True, accept_sparse=accept_sparse) 158 return X, y, binarize_y File D:\Anaconda\Lib\site-packages\imblearn\utils\_sklearn_compat.py:426, in validate_data(_estimator, X, y, reset, validate_separately, skip_check_array, **kwargs) 424 else: 425 force_all_finite = True --> 426 return _estimator._validate_data( 427 X=X, 
428 y=y, 429 reset=reset, 430 validate_separately=validate_separately, 431 force_all_finite=force_all_finite, 432 **kwargs, 433 ) File D:\Anaconda\Lib\site-packages\sklearn\base.py:650, in BaseEstimator._validate_data(self, X, y, reset, validate_separately, cast_to_ndarray, **check_params) 648 y = check_array(y, input_name="y", **check_y_params) 649 else: --> 650 X, y = check_X_y(X, y, **check_params) 651 out = X, y 653 if not no_val_X and check_params.get("ensure_2d", True): File D:\Anaconda\Lib\site-packages\sklearn\utils\validation.py:1273, in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator) 1268 estimator_name = _check_estimator_name(estimator) 1269 raise ValueError( 1270 f"{estimator_name} requires y to be passed, but the target y is None" 1271 ) -> 1273 X = check_array( 1274 X, 1275 accept_sparse=accept_sparse, 1276 accept_large_sparse=accept_large_sparse, 1277 dtype=dtype, 1278 order=order, 1279 copy=copy, 1280 force_all_finite=force_all_finite, 1281 ensure_2d=ensure_2d, 1282 allow_nd=allow_nd, 1283 ensure_min_samples=ensure_min_samples, 1284 ensure_min_features=ensure_min_features, 1285 estimator=estimator, 1286 input_name="X", 1287 ) 1289 y = _check_y(y, multi_output=multi_output, y_numeric=y_numeric, estimator=estimator) 1291 check_consistent_length(X, y) File D:\Anaconda\Lib\site-packages\sklearn\utils\validation.py:1007, in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name) 1005 array = xp.astype(array, dtype, copy=False) 1006 else: -> 1007 array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp) 1008 except ComplexWarning as complex_warning: 1009 raise ValueError( 1010 "Complex data not supported\n{}\n".format(array) 1011 ) from complex_warning File 
D:\Anaconda\Lib\site-packages\sklearn\utils\_array_api.py:746, in _asarray_with_order(array, dtype, order, copy, xp, device) 744 array = numpy.array(array, order=order, dtype=dtype) 745 else: --> 746 array = numpy.asarray(array, order=order, dtype=dtype) 748 # At this point array is a NumPy ndarray. We convert it to an array 749 # container that is consistent with the input's namespace. 750 return xp.asarray(array) File D:\Anaconda\Lib\site-packages\pandas\core\generic.py:2153, in NDFrame.__array__(self, dtype, copy) 2149 def __array__( 2150 self, dtype: npt.DTypeLike | None = None, copy: bool_t | None = None 2151 ) -> np.ndarray: 2152 values = self._values -> 2153 arr = np.asarray(values, dtype=dtype) 2154 if ( 2155 astype_is_view(values.dtype, arr.dtype) 2156 and using_copy_on_write() 2157 and self._mgr.is_single_block 2158 ): 2159 # Check if both conversions can be done without a copy 2160 if astype_is_view(self.dtypes.iloc[0], values.dtype) and astype_is_view( 2161 values.dtype, arr.dtype 2162 ): ValueError: could not convert string to float: 'Male'
06-12
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值