# 数据清洗
# 处理缺失值,如gender为空、非法值(如“8:8”)
user_info['gender'].fillna(-1, inplace=True) # 用-1填充空值
user_info.replace({'gender': {8: -1}}, inplace=True) # 替换非法值
# 检查age_range并处理
user_info['age_range'] = user_info['age_range'].astype(int)
user_info['age_range'] = user_info['age_range'].apply(lambda x: x if 1 <= x <= 8 else 0) # 非法值赋为0
# 时间字段处理
user_log['time_stamp'] = pd.to_datetime(user_log['time_stamp'], errors='coerce') # 注意字段名也应统一
user_log.dropna(subset=['time_stamp'], inplace=True)
# 去重
user_log.drop_duplicates(inplace=True)
# 输出各数据集的数据量(行数、列数)
for name, df in [('user_log', user_log), ('user_info', user_info),
('train_data', train_data), ('test_data', test_data)]:
print(f"{name}: 形状={df.shape} → {df.shape[0]:,} 行 × {df.shape[1]} 列")
报错
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-7-a190a6f22ca7> in <module>
6
7 # 检查age_range并处理
----> 8 user_info['age_range'] = user_info['age_range'].astype(int)
9 user_info['age_range'] = user_info['age_range'].apply(lambda x: x if 1 <= x <= 8 else 0) # 非法值赋为0
10
/opt/conda/lib/python3.6/site-packages/pandas/core/generic.py in astype(self, dtype, copy, errors, **kwargs)
5689 # else, only a single dtype is given
5690 new_data = self._data.astype(dtype=dtype, copy=copy, errors=errors,
-> 5691 **kwargs)
5692 return self._constructor(new_data).__finalize__(self)
5693
/opt/conda/lib/python3.6/site-packages/pandas/core/internals/managers.py in astype(self, dtype, **kwargs)
529
530 def astype(self, dtype, **kwargs):
--> 531 return self.apply('astype', dtype=dtype, **kwargs)
532
533 def convert(self, **kwargs):
/opt/conda/lib/python3.6/site-packages/pandas/core/internals/managers.py in apply(self, f, axes, filter, do_integrity_check, consolidate, **kwargs)
393 copy=align_copy)
394
--> 395 applied = getattr(b, f)(**kwargs)
396 result_blocks = _extend_blocks(applied, result_blocks)
397
/opt/conda/lib/python3.6/site-packages/pandas/core/internals/blocks.py in astype(self, dtype, copy, errors, values, **kwargs)
532 def astype(self, dtype, copy=False, errors='raise', values=None, **kwargs):
533 return self._astype(dtype, copy=copy, errors=errors, values=values,
--> 534 **kwargs)
535
536 def _astype(self, dtype, copy=False, errors='raise', values=None,
/opt/conda/lib/python3.6/site-packages/pandas/core/internals/blocks.py in _astype(self, dtype, copy, errors, values, **kwargs)
631
632 # _astype_nansafe works fine with 1-d only
--> 633 values = astype_nansafe(values.ravel(), dtype, copy=True)
634
635 # TODO(extension)
/opt/conda/lib/python3.6/site-packages/pandas/core/dtypes/cast.py in astype_nansafe(arr, dtype, copy, skipna)
674
675 if not np.isfinite(arr).all():
--> 676 raise ValueError('Cannot convert non-finite values (NA or inf) to '
677 'integer')
678
ValueError: Cannot convert non-finite values (NA or inf) to integer
最新发布