Hadoop-13

日志清洗(数据清洗)

简单解析版
1)需求:
去除日志中按空格切分后字段个数小于等于11的日志。
2)输入数据
在这里插入图片描述
3)实现代码:
(1)编写WebLogMapper

import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * Mapper for the simple log-cleaning job.
 *
 * <p>A line is kept only when splitting it on single spaces yields more than 11 tokens.
 * Kept lines are written unchanged as the output key with a {@link NullWritable} value;
 * all other lines are dropped. Every line increments one of two counters in group
 * {@code "map"}: {@code "true"} for kept lines, {@code "false"} for dropped ones.
 */
public class WebLogMapper extends Mapper<LongWritable, Text, Text, NullWritable>{
	/**
	 * Emits the incoming line when it passes {@link #parseLog(String, Context)}.
	 *
	 * @param key     input key supplied by the framework; not used here
	 * @param value   one line of the log
	 * @param context task context used for counters and output
	 */
	@Override
	protected void map(LongWritable key, Text value, Context context)
			throws IOException, InterruptedException {

		// Classify the line first; this also updates the counters.
		boolean legal = parseLog(value.toString(), context);

		// Only legal lines are forwarded, exactly as they were read.
		if (legal) {
			context.write(value, NullWritable.get());
		}
	}

	/**
	 * Decides whether a log line is legal and records the decision in a counter.
	 *
	 * @param line    raw log line
	 * @param context task context whose counters are incremented
	 * @return {@code true} when the line has more than 11 space-separated tokens
	 */
	private boolean parseLog(String line, Context context) {
		int tokenCount = line.split(" ").length;
		boolean legal = tokenCount > 11;

		// One counter per outcome, both in group "map".
		String counterName = legal ? "true" : "false";
		context.getCounter("map", counterName).increment(1);

		return legal;
	}
}

(2)编写WebLogDriver

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Driver that configures and submits the simple log-cleaning job built on
 * {@link WebLogMapper}.
 *
 * <p>Usage: {@code WebLogDriver <input path> <output path>}.
 */
public class WebLogDriver {

	/**
	 * Configures the job, runs it, and exits with a status reflecting the outcome.
	 *
	 * <p>Exit status: {@code 0} when the job succeeds, {@code 1} when it fails,
	 * {@code 2} when the required arguments are missing.
	 *
	 * @param args {@code args[0]} is the input path, {@code args[1]} the output path
	 * @throws Exception if job setup or submission fails
	 */
	public static void main(String[] args) throws Exception {
		// Fail with a readable message instead of an ArrayIndexOutOfBoundsException.
		if (args.length < 2) {
			System.err.println("Usage: WebLogDriver <input path> <output path>");
			System.exit(2);
		}

		// 1 Obtain the job
		Configuration conf = new Configuration();
		Job job = Job.getInstance(conf);

		// 2 Locate the jar through this class
		job.setJarByClass(WebLogDriver.class);

		// 3 Register the mapper
		// NOTE(review): no reducer count is set here; if a map-only job is intended,
		// consider job.setNumReduceTasks(0) - confirm the desired output layout first.
		job.setMapperClass(WebLogMapper.class);

		// 4 Declare the final output types
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(NullWritable.class);

		// 5 Set the input and output paths
		FileInputFormat.setInputPaths(job, new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));

		// 6 Submit, wait, and propagate the result so a failed job is visible to callers
		boolean succeeded = job.waitForCompletion(true);
		System.exit(succeeded ? 0 : 1);
	}
}

复杂解析版
1)需求:
对web访问日志中的各字段识别切分
去除日志中不合法的记录
根据统计需求,生成各类访问请求过滤数据

2)输入数据
同上

3)实现代码:
(1)定义一个bean,用来记录日志数据中的各数据字段


/**
 * Holds the fields extracted from one web access-log record together with a
 * validity flag.
 *
 * <p>Every field is stored as a raw string and is {@code null} until its setter is
 * called. A newly created bean is valid until {@link #setValid(boolean)} changes that.
 */
public class LogBean {
	/** Delimiter placed between fields by {@link #toString()}. */
	private static final String SEPARATOR = "\001";

	/** Whether the record is considered legal; defaults to {@code true}. */
	private boolean valid = true;

	/** Client IP address. */
	private String remote_addr;
	/** Client user name; the original note says the "-" attribute is ignored. */
	private String remote_user;
	/** Access time and time zone. */
	private String time_local;
	/** Requested URL and HTTP protocol. */
	private String request;
	/** Request status; 200 means success. */
	private String status;
	/** Size of the body content sent to the client. */
	private String body_bytes_sent;
	/** Page from which the request was linked. */
	private String http_referer;
	/** Information about the client's browser. */
	private String http_user_agent;

	/** @return the client IP address, or {@code null} if unset */
	public String getRemote_addr() {
		return this.remote_addr;
	}

	/** @param remote_addr the client IP address */
	public void setRemote_addr(String remote_addr) {
		this.remote_addr = remote_addr;
	}

	/** @return the client user name, or {@code null} if unset */
	public String getRemote_user() {
		return this.remote_user;
	}

	/** @param remote_user the client user name */
	public void setRemote_user(String remote_user) {
		this.remote_user = remote_user;
	}

	/** @return the access time text, or {@code null} if unset */
	public String getTime_local() {
		return this.time_local;
	}

	/** @param time_local the access time text */
	public void setTime_local(String time_local) {
		this.time_local = time_local;
	}

	/** @return the request text, or {@code null} if unset */
	public String getRequest() {
		return this.request;
	}

	/** @param request the request text */
	public void setRequest(String request) {
		this.request = request;
	}

	/** @return the status text, or {@code null} if unset */
	public String getStatus() {
		return this.status;
	}

	/** @param status the status text */
	public void setStatus(String status) {
		this.status = status;
	}

	/** @return the sent-bytes text, or {@code null} if unset */
	public String getBody_bytes_sent() {
		return this.body_bytes_sent;
	}

	/** @param body_bytes_sent the sent-bytes text */
	public void setBody_bytes_sent(String body_bytes_sent) {
		this.body_bytes_sent = body_bytes_sent;
	}

	/** @return the referer text, or {@code null} if unset */
	public String getHttp_referer() {
		return this.http_referer;
	}

	/** @param http_referer the referer text */
	public void setHttp_referer(String http_referer) {
		this.http_referer = http_referer;
	}

	/** @return the user-agent text, or {@code null} if unset */
	public String getHttp_user_agent() {
		return this.http_user_agent;
	}

	/** @param http_user_agent the user-agent text */
	public void setHttp_user_agent(String http_user_agent) {
		this.http_user_agent = http_user_agent;
	}

	/** @return {@code true} while the record is considered legal */
	public boolean isValid() {
		return this.valid;
	}

	/** @param valid whether the record is considered legal */
	public void setValid(boolean valid) {
		this.valid = valid;
	}

	/**
	 * Renders the bean as one delimited line: the validity flag first, then the eight
	 * fields in declaration order, separated by the \001 character. Unset fields
	 * appear as the text {@code null}.
	 */
	@Override
	public String toString() {
		return String.join(SEPARATOR,
				String.valueOf(this.valid),
				this.remote_addr,
				this.remote_user,
				this.time_local,
				this.request,
				this.status,
				this.body_bytes_sent,
				this.http_referer,
				this.http_user_agent);
	}
}

(2)编写LogMapper程序

import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * Mapper for the detailed log-cleaning job.
 *
 * <p>Each line is parsed into a {@link LogBean}. Records judged invalid are dropped;
 * valid ones are written as the bean's {@code toString()} text with a
 * {@link NullWritable} value.
 */
public class LogMapper extends Mapper<LongWritable, Text, Text, NullWritable>{
	// Output key object, reused across map() calls.
	Text k = new Text();

	/**
	 * Parses one log line and emits it when the resulting bean is valid.
	 *
	 * @param key     input key supplied by the framework; not used here
	 * @param value   one line of the log
	 * @param context task context used for output
	 */
	@Override
	protected void map(LongWritable key, Text value, Context context)
			throws IOException, InterruptedException {
		// 1 Read the line
		String line = value.toString();

		// 2 Parse it and check whether it is a legal record
		LogBean bean = parseLog(line);

		if (!bean.isValid()) {
			return;
		}

		k.set(bean.toString());

		// 3 Emit
		context.write(k, NullWritable.get());
	}

	/**
	 * Splits a log line on single spaces and fills a {@link LogBean} from fixed token
	 * positions.
	 *
	 * <p>The bean is marked invalid when the line has 11 or fewer tokens, when the
	 * time token is empty, when the status token is not an integer, or when the
	 * status is 400 or above.
	 *
	 * @param line raw log line
	 * @return a bean that is never {@code null}; check {@link LogBean#isValid()}
	 */
	private LogBean parseLog(String line) {
		LogBean logBean = new LogBean();

		// 1 Split into tokens
		String[] fields = line.split(" ");

		if (fields.length <= 11) {
			// Too few tokens to hold every expected field.
			logBean.setValid(false);
			return logBean;
		}

		// Consecutive spaces yield empty tokens; substring(1) on an empty token would
		// throw, so treat such a line as malformed instead of failing the task.
		String timeField = fields[3];
		if (timeField.isEmpty()) {
			logBean.setValid(false);
			return logBean;
		}

		// 2 Populate the bean
		logBean.setRemote_addr(fields[0]);
		logBean.setRemote_user(fields[1]);
		// Drop the first character of the time token.
		logBean.setTime_local(timeField.substring(1));
		logBean.setRequest(fields[6]);
		logBean.setStatus(fields[8]);
		logBean.setBody_bytes_sent(fields[9]);
		logBean.setHttp_referer(fields[10]);

		// Only tokens 11 and 12 are kept for the user agent; later tokens are dropped.
		if (fields.length > 12) {
			logBean.setHttp_user_agent(fields[11] + " "+ fields[12]);
		}else {
			logBean.setHttp_user_agent(fields[11]);
		}

		// A status of 400 or above is an HTTP error; a non-numeric status means the
		// line is malformed. Both cases make the record invalid.
		try {
			if (Integer.parseInt(logBean.getStatus()) >= 400) {
				logBean.setValid(false);
			}
		} catch (NumberFormatException e) {
			logBean.setValid(false);
		}

		return logBean;
	}
}

(3)编写LogDriver程序

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Driver that configures and submits the log-cleaning job.
 *
 * <p>Expects two command-line arguments: the input path and the output path.
 * The process exit code reflects the job result (0 on success, 1 on failure,
 * 2 when the arguments are missing).
 */
public class LogDriver {
	/**
	 * Builds the job, runs it and exits with a status derived from the job result.
	 *
	 * @param args {@code args[0]} is the input path, {@code args[1]} the output path
	 * @throws Exception if job setup or submission fails
	 */
	public static void main(String[] args) throws Exception {
		// Fail with a usage message instead of an ArrayIndexOutOfBoundsException.
		if (args.length < 2) {
			System.err.println("Usage: LogDriver <input path> <output path>");
			System.exit(2);
		}

		// 1 Get the job instance
		Configuration conf = new Configuration();
		Job job = Job.getInstance(conf);

		// 2 Locate the jar via the driver class
		job.setJarByClass(LogDriver.class);

		// 3 Attach the mapper
		job.setMapperClass(LogMapper.class);

		// 4 Set the final output key/value types
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(NullWritable.class);

		// 5 Set the input and output paths
		FileInputFormat.setInputPaths(job, new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));

		// 6 Submit, wait, and propagate the outcome so that calling scripts
		// can detect a failed job.
		boolean succeeded = job.waitForCompletion(true);
		System.exit(succeeded ? 0 : 1);
	}
}
DEPRECATED: Use of this script to execute hdfs command is deprecated. Instead use the hdfs command for it. 2025-06-18 16:50:39,734 INFO datanode.DataNode: STARTUP_MSG: /************************************************************ STARTUP_MSG: Starting DataNode STARTUP_MSG: host = LAPTOP-FK5QKFGQ/192.168.10.1 STARTUP_MSG: args = [] STARTUP_MSG: version = 3.2.2 STARTUP_MSG: classpath = D:\pyspark\Hadoop\hadoop-3.2.2\etc\hadoop;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\accessors-smart-1.2.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\animal-sniffer-annotations-1.17.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\asm-5.0.4.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\audience-annotations-0.5.0.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\avro-1.7.7.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\checker-qual-2.5.2.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\commons-beanutils-1.9.4.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\commons-cli-1.2.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\commons-codec-1.11.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\commons-collections-3.2.2.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\commons-compress-1.19.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\commons-configuration2-2.1.1.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\commons-io-2.5.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\commons-lang3-3.7.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\commons-logging-1.1.3.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\commons-math3-3.1.1.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\commons-net-3.6.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\commons-text-1.4.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\curator-client-2.13.0.jar;D:\pyspark\Had
oop\hadoop-3.2.2\share\hadoop\common\lib\curator-framework-2.13.0.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\curator-recipes-2.13.0.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\dnsjava-2.1.7.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\error_prone_annotations-2.2.0.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\failureaccess-1.0.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\gson-2.2.4.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\guava-27.0-jre.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\hadoop-annotations-3.2.2.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\hadoop-auth-3.2.2.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\htrace-core4-4.1.0-incubating.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\httpclient-4.5.13.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\httpcore-4.4.13.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\j2objc-annotations-1.1.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\jackson-annotations-2.9.10.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\jackson-core-2.9.10.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\jackson-core-asl-1.9.13.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\jackson-databind-2.9.10.4.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\jackson-jaxrs-1.9.13.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\jackson-mapper-asl-1.9.13.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\jackson-xc-1.9.13.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\javax.activation-api-1.2.0.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\javax.servlet-api-3.1.0.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\jaxb-api-2.2.11.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\jaxb-impl-2.2.3-1.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\jcip-annotations-1.0-1.jar;D:\
pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\jersey-core-1.19.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\jersey-json-1.19.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\jersey-server-1.19.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\jersey-servlet-1.19.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\jettison-1.1.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\jetty-http-9.4.20.v20190813.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\jetty-io-9.4.20.v20190813.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\jetty-security-9.4.20.v20190813.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\jetty-server-9.4.20.v20190813.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\jetty-servlet-9.4.20.v20190813.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\jetty-util-9.4.20.v20190813.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\jetty-webapp-9.4.20.v20190813.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\jetty-xml-9.4.20.v20190813.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\jsch-0.1.55.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\json-smart-2.3.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\jsp-api-2.1.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\jsr305-3.0.2.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\jsr311-api-1.1.1.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\jul-to-slf4j-1.7.25.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\kerb-admin-1.0.1.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\kerb-client-1.0.1.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\kerb-common-1.0.1.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\kerb-core-1.0.1.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\kerb-crypto-1.0.1.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\kerb-identity-1.0.1.jar;D:\pyspark\Had
oop\hadoop-3.2.2\share\hadoop\common\lib\kerb-server-1.0.1.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\kerb-simplekdc-1.0.1.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\kerb-util-1.0.1.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\kerby-asn1-1.0.1.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\kerby-config-1.0.1.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\kerby-pkix-1.0.1.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\kerby-util-1.0.1.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\kerby-xdr-1.0.1.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\listenablefuture-9999.0-empty-to-avoid-conflict-with-guava.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\log4j-1.2.17.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\metrics-core-3.2.4.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\netty-3.10.6.Final.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\nimbus-jose-jwt-7.9.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\paranamer-2.3.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\protobuf-java-2.5.0.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\re2j-1.1.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\slf4j-api-1.7.25.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\slf4j-log4j12-1.7.25.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\snappy-java-1.0.5.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\stax2-api-3.1.4.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\token-provider-1.0.1.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\woodstox-core-5.0.3.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\lib\zookeeper-3.4.13.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\hadoop-common-3.2.2-tests.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\hadoop-common-3.2.2.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\hadoop
-kms-3.2.2.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\common\hadoop-nfs-3.2.2.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\accessors-smart-1.2.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\animal-sniffer-annotations-1.17.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\asm-5.0.4.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\audience-annotations-0.5.0.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\avro-1.7.7.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\checker-qual-2.5.2.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\commons-beanutils-1.9.4.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\commons-cli-1.2.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\commons-codec-1.11.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\commons-collections-3.2.2.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\commons-compress-1.19.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\commons-configuration2-2.1.1.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\commons-daemon-1.0.13.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\commons-io-2.5.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\commons-lang3-3.7.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\commons-logging-1.1.3.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\commons-math3-3.1.1.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\commons-net-3.6.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\commons-text-1.4.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\curator-client-2.13.0.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\curator-framework-2.13.0.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\curator-recipes-2.13.0.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\dnsjava-2.1.7.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\error_prone_annotations-2.2.0.j
ar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\failureaccess-1.0.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\gson-2.2.4.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\guava-27.0-jre.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\hadoop-annotations-3.2.2.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\hadoop-auth-3.2.2.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\htrace-core4-4.1.0-incubating.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\httpclient-4.5.13.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\httpcore-4.4.13.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\j2objc-annotations-1.1.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\jackson-annotations-2.9.10.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\jackson-core-2.9.10.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\jackson-core-asl-1.9.13.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\jackson-databind-2.9.10.4.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\jackson-jaxrs-1.9.13.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\jackson-mapper-asl-1.9.13.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\jackson-xc-1.9.13.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\javax.activation-api-1.2.0.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\javax.servlet-api-3.1.0.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\jaxb-api-2.2.11.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\jaxb-impl-2.2.3-1.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\jcip-annotations-1.0-1.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\jersey-core-1.19.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\jersey-json-1.19.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\jersey-server-1.19.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\jersey-servlet-1.19.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\je
ttison-1.1.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\jetty-http-9.4.20.v20190813.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\jetty-io-9.4.20.v20190813.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\jetty-security-9.4.20.v20190813.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\jetty-server-9.4.20.v20190813.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\jetty-servlet-9.4.20.v20190813.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\jetty-util-9.4.20.v20190813.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\jetty-util-ajax-9.4.20.v20190813.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\jetty-webapp-9.4.20.v20190813.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\jetty-xml-9.4.20.v20190813.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\jsch-0.1.55.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\json-simple-1.1.1.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\json-smart-2.3.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\jsr305-3.0.2.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\jsr311-api-1.1.1.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\kerb-admin-1.0.1.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\kerb-client-1.0.1.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\kerb-common-1.0.1.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\kerb-core-1.0.1.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\kerb-crypto-1.0.1.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\kerb-identity-1.0.1.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\kerb-server-1.0.1.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\kerb-simplekdc-1.0.1.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\kerb-util-1.0.1.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\kerby-asn1-1.0.1.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\kerby-config-1.0.1.jar;D:\pyspark\Hadoop\hado
op-3.2.2\share\hadoop\hdfs\lib\kerby-pkix-1.0.1.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\kerby-util-1.0.1.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\kerby-xdr-1.0.1.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\leveldbjni-all-1.8.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\listenablefuture-9999.0-empty-to-avoid-conflict-with-guava.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\log4j-1.2.17.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\netty-3.10.6.Final.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\netty-all-4.1.48.Final.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\nimbus-jose-jwt-7.9.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\okhttp-2.7.5.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\okio-1.6.0.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\paranamer-2.3.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\protobuf-java-2.5.0.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\re2j-1.1.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\snappy-java-1.0.5.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\stax2-api-3.1.4.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\token-provider-1.0.1.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\woodstox-core-5.0.3.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\lib\zookeeper-3.4.13.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\hadoop-hdfs-3.2.2-tests.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\hadoop-hdfs-3.2.2.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\hadoop-hdfs-client-3.2.2-tests.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\hadoop-hdfs-client-3.2.2.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\hadoop-hdfs-httpfs-3.2.2.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\hadoop-hdfs-native-client-3.2.2-tests.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\hadoop-hdfs-native-client-3.2.2.jar;D:\pyspark\Ha
doop\hadoop-3.2.2\share\hadoop\hdfs\hadoop-hdfs-nfs-3.2.2.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\hadoop-hdfs-rbf-3.2.2-tests.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\hdfs\hadoop-hdfs-rbf-3.2.2.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\yarn;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\yarn\lib\aopalliance-1.0.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\yarn\lib\bcpkix-jdk15on-1.60.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\yarn\lib\bcprov-jdk15on-1.60.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\yarn\lib\ehcache-3.3.1.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\yarn\lib\fst-2.50.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\yarn\lib\geronimo-jcache_1.0_spec-1.0-alpha-1.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\yarn\lib\guice-4.0.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\yarn\lib\guice-servlet-4.0.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\yarn\lib\HikariCP-java7-2.4.12.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\yarn\lib\jackson-jaxrs-base-2.9.10.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\yarn\lib\jackson-jaxrs-json-provider-2.9.10.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\yarn\lib\jackson-module-jaxb-annotations-2.9.10.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\yarn\lib\java-util-1.9.0.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\yarn\lib\javax.inject-1.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\yarn\lib\jersey-client-1.19.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\yarn\lib\jersey-guice-1.19.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\yarn\lib\json-io-2.5.1.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\yarn\lib\metrics-core-3.2.4.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\yarn\lib\mssql-jdbc-6.2.1.jre7.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\yarn\lib\objenesis-1.0.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\yarn\lib\snakeyaml-1.16.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\yarn\lib\swagger-annotations-1.5.4.jar;D:\pyspark\Hadoop\hadoop-3.2.2\shar
e\hadoop\yarn\hadoop-yarn-api-3.2.2.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\yarn\hadoop-yarn-applications-distributedshell-3.2.2.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\yarn\hadoop-yarn-applications-unmanaged-am-launcher-3.2.2.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\yarn\hadoop-yarn-client-3.2.2.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\yarn\hadoop-yarn-common-3.2.2.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\yarn\hadoop-yarn-registry-3.2.2.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\yarn\hadoop-yarn-server-applicationhistoryservice-3.2.2.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\yarn\hadoop-yarn-server-common-3.2.2.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\yarn\hadoop-yarn-server-nodemanager-3.2.2.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\yarn\hadoop-yarn-server-resourcemanager-3.2.2.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\yarn\hadoop-yarn-server-router-3.2.2.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\yarn\hadoop-yarn-server-sharedcachemanager-3.2.2.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\yarn\hadoop-yarn-server-tests-3.2.2.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\yarn\hadoop-yarn-server-timeline-pluginstorage-3.2.2.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\yarn\hadoop-yarn-server-web-proxy-3.2.2.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\yarn\hadoop-yarn-services-api-3.2.2.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\yarn\hadoop-yarn-services-core-3.2.2.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\yarn\hadoop-yarn-submarine-3.2.2.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\mapreduce\lib\hamcrest-core-1.3.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\mapreduce\lib\junit-4.11.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\mapreduce\hadoop-mapreduce-client-app-3.2.2.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\mapreduce\hadoop-mapreduce-client-common-3.2.2.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\mapreduce\hadoop-mapreduce-client-core-3.2.2.jar;D:\pys
park\Hadoop\hadoop-3.2.2\share\hadoop\mapreduce\hadoop-mapreduce-client-hs-3.2.2.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\mapreduce\hadoop-mapreduce-client-hs-plugins-3.2.2.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\mapreduce\hadoop-mapreduce-client-jobclient-3.2.2-tests.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\mapreduce\hadoop-mapreduce-client-jobclient-3.2.2.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\mapreduce\hadoop-mapreduce-client-nativetask-3.2.2.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\mapreduce\hadoop-mapreduce-client-shuffle-3.2.2.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\mapreduce\hadoop-mapreduce-client-uploader-3.2.2.jar;D:\pyspark\Hadoop\hadoop-3.2.2\share\hadoop\mapreduce\hadoop-mapreduce-examples-3.2.2.jar STARTUP_MSG: build = Unknown -r 7a3bc90b05f257c8ace2f76d74264906f0f7a932; compiled by 'hexiaoqiao' on 2021-01-03T09:26Z STARTUP_MSG: java = 1.8.0_281 ************************************************************/ 2025-06-18 16:50:45,335 INFO checker.ThrottledAsyncChecker: Scheduling a check for [DISK]file:/D:/hadoop-3.2.2/data/datanode 2025-06-18 16:50:45,420 INFO impl.MetricsConfig: Loaded properties from hadoop-metrics2.properties 2025-06-18 16:50:45,483 INFO impl.MetricsSystemImpl: Scheduled Metric snapshot period at 10 second(s). 2025-06-18 16:50:45,484 INFO impl.MetricsSystemImpl: DataNode metrics system started 2025-06-18 16:50:46,677 INFO common.Util: dfs.datanode.fileio.profiling.sampling.percentage set to 0. Disabling file IO profiling 2025-06-18 16:50:46,689 INFO datanode.BlockScanner: Initialized block scanner with targetBytesPerSec 1048576 2025-06-18 16:50:46,692 INFO datanode.DataNode: Configured hostname is LAPTOP-FK5QKFGQ 2025-06-18 16:50:46,693 INFO common.Util: dfs.datanode.fileio.profiling.sampling.percentage set to 0. 
Disabling file IO profiling 2025-06-18 16:50:46,695 INFO datanode.DataNode: Starting DataNode with maxLockedMemory = 0 2025-06-18 16:50:46,709 INFO datanode.DataNode: Opened streaming server at /0.0.0.0:9866 2025-06-18 16:50:46,710 INFO datanode.DataNode: Balancing bandwidth is 10485760 bytes/s 2025-06-18 16:50:46,710 INFO datanode.DataNode: Number threads for balancing is 50 2025-06-18 16:50:46,741 INFO util.log: Logging initialized @7589ms to org.eclipse.jetty.util.log.Slf4jLog 2025-06-18 16:50:51,787 INFO server.AuthenticationFilter: Unable to initialize FileSignerSecretProvider, falling back to use random secrets. 2025-06-18 16:50:51,821 INFO http.HttpRequestLog: Http request log for http.requests.datanode is not defined 2025-06-18 16:50:51,828 INFO http.HttpServer2: Added global filter 'safety' (class=org.apache.hadoop.http.HttpServer2$QuotingInputFilter) 2025-06-18 16:50:51,829 INFO http.HttpServer2: Added filter static_user_filter (class=org.apache.hadoop.http.lib.StaticUserWebFilter$StaticUserFilter) to context datanode 2025-06-18 16:50:51,829 INFO http.HttpServer2: Added filter static_user_filter (class=org.apache.hadoop.http.lib.StaticUserWebFilter$StaticUserFilter) to context logs 2025-06-18 16:50:51,830 INFO http.HttpServer2: Added filter static_user_filter (class=org.apache.hadoop.http.lib.StaticUserWebFilter$StaticUserFilter) to context static 2025-06-18 16:50:51,848 INFO http.HttpServer2: Jetty bound to port 38751 2025-06-18 16:50:51,849 INFO server.Server: jetty-9.4.20.v20190813; built: 2019-08-13T21:28:18.144Z; git: 84700530e645e812b336747464d6fbbf370c9a20; jvm 1.8.0_281-b09 2025-06-18 16:50:51,865 INFO server.session: DefaultSessionIdManager workerName=node0 2025-06-18 16:50:51,865 INFO server.session: No SessionScavenger set, using defaults 2025-06-18 16:50:51,867 INFO server.session: node0 Scavenging every 660000ms 2025-06-18 16:50:51,874 INFO handler.ContextHandler: Started 
o.e.j.s.ServletContextHandler@2421cc4{logs,/logs,file:///D:/pyspark/Hadoop/hadoop-3.2.2/logs/,AVAILABLE} 2025-06-18 16:50:51,874 INFO handler.ContextHandler: Started o.e.j.s.ServletContextHandler@21ba0741{static,/static,file:///D:/pyspark/Hadoop/hadoop-3.2.2/share/hadoop/hdfs/webapps/static/,AVAILABLE} 2025-06-18 16:50:51,926 INFO util.TypeUtil: JVM Runtime does not support Modules 2025-06-18 16:50:51,932 INFO handler.ContextHandler: Started o.e.j.w.WebAppContext@43f82e78{datanode,/,file:///D:/pyspark/Hadoop/hadoop-3.2.2/share/hadoop/hdfs/webapps/datanode/,AVAILABLE}{file:/D:/pyspark/Hadoop/hadoop-3.2.2/share/hadoop/hdfs/webapps/datanode} 2025-06-18 16:50:51,939 INFO server.AbstractConnector: Started ServerConnector@1e097d59{HTTP/1.1,[http/1.1]}{localhost:38751} 2025-06-18 16:50:51,940 INFO server.Server: Started @12789ms 2025-06-18 16:50:52,540 INFO web.DatanodeHttpServer: Listening HTTP traffic on /0.0.0.0:9864 2025-06-18 16:50:52,545 INFO util.JvmPauseMonitor: Starting JVM pause monitor 2025-06-18 16:50:52,545 INFO datanode.DataNode: dnUserName = aaa 2025-06-18 16:50:52,546 INFO datanode.DataNode: supergroup = supergroup 2025-06-18 16:50:52,573 INFO ipc.CallQueueManager: Using callQueue: class java.util.concurrent.LinkedBlockingQueue, queueCapacity: 1000, scheduler: class org.apache.hadoop.ipc.DefaultRpcScheduler, ipcBackoff: false. 
2025-06-18 16:50:52,583 INFO ipc.Server: Starting Socket Reader #1 for port 9867 2025-06-18 16:50:52,720 INFO datanode.DataNode: Opened IPC server at /0.0.0.0:9867 2025-06-18 16:50:52,729 INFO datanode.DataNode: Refresh request received for nameservices: null 2025-06-18 16:50:52,735 INFO datanode.DataNode: Starting BPOfferServices for nameservices: <default> 2025-06-18 16:50:52,740 INFO datanode.DataNode: Block pool <registering> (Datanode Uuid unassigned) service to localhost/127.0.0.1:9000 starting to offer service 2025-06-18 16:50:52,745 INFO ipc.Server: IPC Server Responder: starting 2025-06-18 16:50:52,745 INFO ipc.Server: IPC Server listener on 9867: starting 2025-06-18 16:50:52,954 INFO datanode.DataNode: Acknowledging ACTIVE Namenode during handshakeBlock pool <registering> (Datanode Uuid unassigned) service to localhost/127.0.0.1:9000 2025-06-18 16:50:52,956 INFO common.Storage: Using 1 threads to upgrade data directories (dfs.datanode.parallel.volumes.load.threads.num=1, dataDirs=1) 2025-06-18 16:50:52,965 INFO common.Storage: Lock on D:\hadoop-3.2.2\data\datanode\in_use.lock acquired by nodename 16808@LAPTOP-FK5QKFGQ 2025-06-18 16:50:52,970 WARN common.Storage: Failed to add storage directory [DISK]file:/D:/hadoop-3.2.2/data/datanode java.io.IOException: Incompatible clusterIDs in D:\hadoop-3.2.2\data\datanode: namenode clusterID = CID-0243def2-304c-4ffd-871c-57b2cdf0182f; datanode clusterID = CID-a6ff55fc-9daf-4605-8a53-edaae5a9f8de at org.apache.hadoop.hdfs.server.datanode.DataStorage.doTransition(DataStorage.java:744) at org.apache.hadoop.hdfs.server.datanode.DataStorage.loadStorageDirectory(DataStorage.java:294) at org.apache.hadoop.hdfs.server.datanode.DataStorage.loadDataStorage(DataStorage.java:407) at org.apache.hadoop.hdfs.server.datanode.DataStorage.addStorageLocations(DataStorage.java:387) at org.apache.hadoop.hdfs.server.datanode.DataStorage.recoverTransitionRead(DataStorage.java:559) at 
org.apache.hadoop.hdfs.server.datanode.DataNode.initStorage(DataNode.java:1748) at org.apache.hadoop.hdfs.server.datanode.DataNode.initBlockPool(DataNode.java:1684) at org.apache.hadoop.hdfs.server.datanode.BPOfferService.verifyAndSetNamespaceInfo(BPOfferService.java:392) at org.apache.hadoop.hdfs.server.datanode.BPServiceActor.connectToNNAndHandshake(BPServiceActor.java:282) at org.apache.hadoop.hdfs.server.datanode.BPServiceActor.run(BPServiceActor.java:829) at java.lang.Thread.run(Thread.java:748) 2025-06-18 16:50:52,973 ERROR datanode.DataNode: Initialization failed for Block pool <registering> (Datanode Uuid cd899db0-fd95-4996-8250-261d1d36dbda) service to localhost/127.0.0.1:9000. Exiting. java.io.IOException: All specified directories have failed to load. at org.apache.hadoop.hdfs.server.datanode.DataStorage.recoverTransitionRead(DataStorage.java:560) at org.apache.hadoop.hdfs.server.datanode.DataNode.initStorage(DataNode.java:1748) at org.apache.hadoop.hdfs.server.datanode.DataNode.initBlockPool(DataNode.java:1684) at org.apache.hadoop.hdfs.server.datanode.BPOfferService.verifyAndSetNamespaceInfo(BPOfferService.java:392) at org.apache.hadoop.hdfs.server.datanode.BPServiceActor.connectToNNAndHandshake(BPServiceActor.java:282) at org.apache.hadoop.hdfs.server.datanode.BPServiceActor.run(BPServiceActor.java:829) at java.lang.Thread.run(Thread.java:748) 2025-06-18 16:50:52,973 WARN datanode.DataNode: Ending block pool service for: Block pool <registering> (Datanode Uuid cd899db0-fd95-4996-8250-261d1d36dbda) service to localhost/127.0.0.1:9000 2025-06-18 16:50:52,974 INFO datanode.DataNode: Removed Block pool <registering> (Datanode Uuid cd899db0-fd95-4996-8250-261d1d36dbda) 2025-06-18 16:50:54,974 WARN datanode.DataNode: Exiting Datanode 2025-06-18 16:50:54,976 INFO datanode.DataNode: SHUTDOWN_MSG: /************************************************************ SHUTDOWN_MSG: Shutting down DataNode at LAPTOP-FK5QKFGQ/192.168.10.1 
************************************************************/
06-19
<think>嗯,用户遇到了HadoopDataNode启动失败的问题,具体报错是集群ID不兼容(java.io.IOException:IncompatibleclusterIDs)。从引用内容看,这确实是HDFS部署中的经典问题。用户应该是个刚接触Hadoop的系统管理员或大数据开发者,正在搭建或维护集群。ta直接粘贴了错误关键词,说明已经查看了日志文件,但可能不太清楚clusterID的同步机制。这类问题通常发生在格式化NameNode后未同步DataNode的场景。引用[2]给出了最清晰的诊断路径:通过hdfs-site.xml定位VERSION文件位置。但用户可能忽略了关键点——格式化NameNode会改变其clusterID(引用[4]),而DataNode的ID保持不变。引用[1][3][5]提供的解决方案各有侧重,需要整合:首先必须强调风险点:清空DataNode数据目录(引用[5])会导致数据丢失!应该优先尝试引用[1]的非破坏性方案:手动修改DataNode的clusterID与NameNode一致。不过如果集群没有重要数据,引用[5]的彻底重建方案更彻底。需要提醒用户检查的细节:-所有DataNode节点的VERSION文件都要修改-停止服务后再操作文件-注意NameNode的VERSION文件路径可能包含多个副本(比如HA集群)用户可能还会困惑为什么ID会变。这涉及到HDFS的元数据管理机制:格式化时生成新集群ID,而DataNode首次启动时从NameNode获取该ID并持久化。如果DataNode已有旧ID记录,就会冲突。最后生成的延伸问题覆盖了操作预防、HA场景和日志分析,这些都是用户后续可能遇到的关联问题。特别是HA集群的ID同步更复杂,需要单独说明。</think>### Hadoop DataNode启动失败:集群ID不兼容问题解决指南 #### 问题原因 当出现 `java.io.IOException: Incompatible clusterIDs` 错误时,表明**NameNode与DataNode的集群标识符不匹配**。核心原因是: 1. **NameNode格式化重置ID**:执行 `hdfs namenode -format` 会生成新`clusterID`(引用[4]) 2. **DataNode未同步更新**:DataNode仍保留旧`clusterID`(引用[1][2]) 3. **ID比对失败**:启动时DataNode检测到与NameNode的ID不一致(引用[3]) #### 解决步骤 **方案一:修改DataNode的clusterID(推荐)** ```bash # 1. 停止HDFS服务 stop-dfs.sh # 2. 定位DataNode的VERSION文件 # 路径参考hdfs-site.xml中dfs.datanode.data.dir配置项(引用[2]) cd /opt/module/hadoop-2.7.2/data/tmp/dfs/data/current # 3. 修改VERSION文件 vim VERSION # 将clusterID值改为NameNode的clusterID(引用[1]) # 4. 启动HDFS start-dfs.sh ``` **方案二:彻底重建(适用于新集群/无数据时)** ```bash # 1. 停止所有服务 stop-all.sh # 2. 清空所有DataNode数据目录 rm -rf /opt/module/hadoop-2.7.2/data/tmp/dfs/data/* # 3. 重新格式化NameNode(引用[5]) hdfs namenode -format # 4. 启动集群 start-all.sh ``` #### 关键配置文件位置 | 组件 | 配置文件位置 | 查看方法 | |-----------|---------------------------------------|------------------------------| | NameNode | `dfs.namenode.name.dir/current/VERSION` | 执行 `cat VERSION \| grep clusterID` | | DataNode | `dfs.datanode.data.dir/current/VERSION` | 同上(引用[2]) | > **注意**:修改前务必备份数据!多节点集群需在所有DataNode上同步操作。 #### 预防措施 1. **避免重复格式化**:格式化NameNode会重置集群ID(引用[4]) 2. 
**维护配置一致性**:确保所有节点`hdfs-site.xml`中路径配置相同
3. **使用脚本同步操作**:批量执行DataNode的ID更新

[^1]: 通过修改DataNode的clusterID解决兼容性问题
[^2]: 通过hdfs-site.xml定位VERSION文件路径
[^4]: NameNode格式化会重置clusterID
[^5]: 清空DataNode目录后重建集群
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值