CDH 5.14: fixing "cannot create /tmp/.cloudera_health_monitoring_canary_files" after restarting HDFS

After restarting HDFS on a CDH 5.14 cluster, the Cloudera Manager health check may report that it cannot create /tmp/.cloudera_health_monitoring_canary_files. This usually happens because the NameNode is still in safe mode, during which HDFS rejects writes. This post shows how to leave safe mode correctly and how to resolve the "JAVA_HOME is not set" error that the command may raise along the way, so that the cluster's health checks return to normal.

Leave safe mode as the hdfs user:

[root@masternode ~]# sudo -u hdfs hdfs dfsadmin -safemode leave
Safe mode is OFF
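
To confirm the NameNode state before and after, you can query it directly; once safe mode is off, the health-monitoring canary should again be able to create its files under /tmp. A quick check (both commands are standard HDFS CLI; the /tmp listing is only an illustration):

[root@masternode ~]# sudo -u hdfs hdfs dfsadmin -safemode get
Safe mode is OFF
[root@masternode ~]# sudo -u hdfs hdfs dfs -ls /tmp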

 

The command above may fail with:

Error: JAVA_HOME is not set and could not be found.

Solution: point /usr/java/default at the installed JDK so the Hadoop wrapper scripts can find Java:

[hadoop@masternode ~]$ sudo -i
[root@masternode ~]# mkdir -p /usr/java
[root@masternode ~]# ln -s /usr/local/jdk1.8.0_171 /usr/java/default
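
CDH's Hadoop wrapper scripts typically locate the JDK via bigtop-detect-javahome, which searches a short list of well-known locations, /usr/java/default among them, which is why the symlink is enough. To verify the link and re-run the original command (a sketch, assuming the JDK really is installed at /usr/local/jdk1.8.0_171 as above):

[root@masternode ~]# ls -l /usr/java/default
[root@masternode ~]# /usr/java/default/bin/java -version
[root@masternode ~]# sudo -u hdfs hdfs dfsadmin -safemode leave

If you prefer not to create the symlink, passing JAVA_HOME explicitly on the command line may also work, depending on how hadoop-env is configured on your cluster:

[root@masternode ~]# sudo -u hdfs env JAVA_HOME=/usr/local/jdk1.8.0_171 hdfs dfsadmin -safemode leave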
