For the Hadoop environment setup, see the first and third posts in this Hadoop series (the Hadoop configuration directly affects whether this program runs).
Data preparation (https://download.youkuaiyun.com/download/elmo66/10636257):
[hadoop@yourname ~]$ hadoop dfs -mkdir /UrlViewerCounter
[hadoop@yourname ~]$ hadoop dfs -mkdir /UrlViewerCounter/input
[hadoop@yourname ~]$ hadoop dfs -copyFromLocal access.log.10 /UrlViewerCounter/input/
yourname is explained in the first post of the Hadoop series; hadoop is the username used to log in to the Linux system; ~ refers to the /home/hadoop directory; access.log.10 sits in /home/hadoop and is uploaded to the /UrlViewerCounter/input/ directory on HDFS. (On newer Hadoop versions, hdfs dfs replaces the deprecated hadoop dfs.)
Sample data:
60.208.6.156 - - [18/Sep/2013:06:49:48 +0000] "GET /wp-content/uploads/2013/07/rcassandra.png HTTP/1.0" 200 185524 "http://cos.name/category/software/packages/" "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36"
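Before designing the class, note where each field lands when the line is split on single spaces; the parser further down relies on exactly these indices. A quick local sketch (the class name is made up for illustration):

public class SplitIndexDemo {
    public static void main(String[] args) {
        String line = "60.208.6.156 - - [18/Sep/2013:06:49:48 +0000] "
                + "\"GET /wp-content/uploads/2013/07/rcassandra.png HTTP/1.0\" "
                + "200 185524 \"http://cos.name/category/software/packages/\" "
                + "\"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) "
                + "Chrome/29.0.1547.66 Safari/537.36\"";
        String[] arr = line.split(" ");
        System.out.println(arr[0]);  // 60.208.6.156          -> client IP
        System.out.println(arr[1]);  // -                     -> client name placeholder
        System.out.println(arr[3]);  // [18/Sep/2013:06:49:48 -> request time (leading "[")
        System.out.println(arr[4]);  // +0000]                -> time zone
        System.out.println(arr[5]);  // "GET                  -> request method (leading quote)
        System.out.println(arr[6]);  // /wp-content/uploads/2013/07/rcassandra.png -> request URL
        System.out.println(arr[7]);  // HTTP/1.0"             -> protocol (trailing quote)
        System.out.println(arr[8]);  // 200                   -> response status
        System.out.println(arr[9]);  // 185524                -> response bytes
        System.out.println(arr[10]); // "http://cos.name/category/software/packages/" -> referer
        System.out.println(arr[11]); // "Mozilla/5.0          -> only the first token of the user agent
    }
}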
Design a class based on the sample:

public class KPI {
    private String clientAddr;            // client IP address
    private String clientName;            // client user name; the placeholder "-" is ignored
    private String clientRequestTime;     // request time and time zone
    private String clientRequestMethod;   // request method
    private String clientRequestUrl;      // requested URL
    private String clientRequestProtocol; // HTTP protocol of the request
    private String responseStatus;        // response status; 200 means success
    private String responseBytes;         // size of the response body sent to the client
    private String urlReferer;            // page the request was linked from
    private String httpAgent;             // client browser information
    private boolean valid = true;         // whether the record is well-formed

    public String getClientAddr() { return clientAddr; }
    public void setClientAddr(String clientAddr) { this.clientAddr = clientAddr; }

    public String getClientName() { return clientName; }
    public void setClientName(String clientName) { this.clientName = clientName; }

    public String getClientRequestTime() { return clientRequestTime; }
    public void setClientRequestTime(String clientRequestTime) { this.clientRequestTime = clientRequestTime; }

    public String getClientRequestMethod() { return clientRequestMethod; }
    public void setClientRequestMethod(String clientRequestMethod) { this.clientRequestMethod = clientRequestMethod; }

    public String getClientRequestUrl() { return clientRequestUrl; }
    public void setClientRequestUrl(String clientRequestUrl) { this.clientRequestUrl = clientRequestUrl; }

    public String getClientRequestProtocol() { return clientRequestProtocol; }
    public void setClientRequestProtocol(String clientRequestProtocol) { this.clientRequestProtocol = clientRequestProtocol; }

    public String getResponseStatus() { return responseStatus; }
    public void setResponseStatus(String responseStatus) { this.responseStatus = responseStatus; }

    public String getResponseBytes() { return responseBytes; }
    public void setResponseBytes(String responseBytes) { this.responseBytes = responseBytes; }

    public String getUrlReferer() { return urlReferer; }
    public void setUrlReferer(String urlReferer) { this.urlReferer = urlReferer; }

    public String getHttpAgent() { return httpAgent; }
    public void setHttpAgent(String httpAgent) { this.httpAgent = httpAgent; }

    public boolean isValid() { return valid; }
    public void setValid(boolean valid) { this.valid = valid; }

    @Override
    public String toString() {
        return "KPI [clientAddr=" + clientAddr + ", clientName=" + clientName
                + ", clientRequestTime=" + clientRequestTime
                + ", clientRequestMethod=" + clientRequestMethod
                + ", clientRequestUrl=" + clientRequestUrl
                + ", clientRequestProtocol=" + clientRequestProtocol
                + ", responseStatus=" + responseStatus
                + ", responseBytes=" + responseBytes
                + ", urlReferer=" + urlReferer
                + ", httpAgent=" + httpAgent
                + ", valid=" + valid + "]";
    }
}
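The bean above only ever lives inside a single mapper call, so plain String fields are enough. If you ever wanted to emit the whole object as a map output value instead of just the URL, it would have to implement Hadoop's Writable interface. A minimal sketch of that idea (hypothetical class, only two of the fields shown):

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

public class KPIWritable implements Writable {
    private String clientRequestUrl = "";
    private String responseStatus = "";

    @Override
    public void write(DataOutput out) throws IOException {
        // Serialization order must match readFields exactly
        out.writeUTF(clientRequestUrl);
        out.writeUTF(responseStatus);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        clientRequestUrl = in.readUTF();
        responseStatus = in.readUTF();
    }
}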
Data parsing and filtering:

import java.util.HashSet;
import java.util.Set;

public class KPIUtils {

    public static KPI kpiParse(String line) {
        System.out.println(line);
        KPI kpi = new KPI();
        String[] arr = line.split(" ");
        if (arr.length >= 12) {
            kpi.setClientAddr(arr[0]);
            kpi.setClientName(arr[1]);
            kpi.setClientRequestTime(arr[3].replace("[", ""));   // the time zone token arr[4] is dropped
            kpi.setClientRequestMethod(arr[5].replace("\"", ""));
            kpi.setClientRequestUrl(arr[6]);
            kpi.setClientRequestProtocol(arr[7].replace("\"", ""));
            kpi.setResponseStatus(arr[8]);
            kpi.setResponseBytes(arr[9]);
            kpi.setUrlReferer(arr[10]);
            kpi.setHttpAgent(arr[11]); // the user agent contains spaces, so only its first token is kept
            kpi.setValid(true);
        } else {
            kpi.setValid(false);
        }
        return kpi;
    }

    public static KPI urlFilter(String line) {
        KPI kpi = kpiParse(line);
        // With large data volumes List.contains is slow (linear scan), so use a HashSet instead
        // List<String> urls = new ArrayList<String>();
        Set<String> urls = new HashSet<String>();
        urls.add("/about");
        urls.add("/black-ip-list/");
        urls.add("/cassandra-clustor/");
        urls.add("/finance-rhive-repurchase/");
        urls.add("/hadoop-family-roadmap/");
        urls.add("/hadoop-hive-intro/");
        urls.add("/hadoop-zookeeper-intro/");
        urls.add("/hadoop-mahout-roadmap/");
        if (!urls.contains(kpi.getClientRequestUrl())) {
            kpi.setValid(false);
        }
        return kpi;
    }
}
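Because both methods are plain static functions, the parser can be checked locally before going anywhere near the cluster. A minimal check against the sample line above (hypothetical class, not part of the original program):

public class KPIUtilsDemo {
    public static void main(String[] args) {
        String line = "60.208.6.156 - - [18/Sep/2013:06:49:48 +0000] "
                + "\"GET /wp-content/uploads/2013/07/rcassandra.png HTTP/1.0\" "
                + "200 185524 \"http://cos.name/category/software/packages/\" "
                + "\"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) "
                + "Chrome/29.0.1547.66 Safari/537.36\"";

        KPI parsed = KPIUtils.kpiParse(line);
        System.out.println(parsed.isValid());              // true: the line has all 12 tokens
        System.out.println(parsed.getClientRequestUrl());  // /wp-content/uploads/2013/07/rcassandra.png

        KPI filtered = KPIUtils.urlFilter(line);
        System.out.println(filtered.isValid());            // false: the png URL is not in the whitelist
    }
}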
Page (URL) view count:

package com.hadoop.kpi;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class KPIUrlViewerCounter {

    public static class KPIUrlViewerMapper extends Mapper<Object, Text, Text, IntWritable> {
        private IntWritable one = new IntWritable(1);
        private Text word = new Text();

        @Override
        protected void map(Object key, Text value,
                Mapper<Object, Text, Text, IntWritable>.Context context)
                throws IOException, InterruptedException {
            // Parse the log line and keep it only if its URL passes the whitelist filter
            KPI kpi = KPIUtils.urlFilter(value.toString());
            if (kpi.isValid()) {
                word.set(kpi.getClientRequestUrl());
                context.write(word, one);
            }
        }
    }

    public static class KPIUrlViewerReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        private IntWritable result = new IntWritable();

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values,
                Reducer<Text, IntWritable, Text, IntWritable>.Context output)
                throws IOException, InterruptedException {
            // Sum the per-URL counts emitted by the mappers
            int sum = 0;
            for (IntWritable value : values) {
                sum += value.get();
            }
            result.set(sum);
            output.write(key, result);
        }
    }

    public static void main(String[] args) throws Exception {
        String input = "hdfs://192.168.1.101:9000/UrlViewerCounter/input";
        String output = "hdfs://192.168.1.101:9000/UrlViewerCounter/output";

        Configuration conf = new Configuration();
        // These configuration settings are required
        conf.set("mapreduce.framework.name", "yarn");
        conf.set("yarn.resourcemanager.hostname", "192.168.1.101");
        conf.set("fs.defaultFS", "hdfs://192.168.1.101:9000/");
        conf.set("mapreduce.app-submission.cross-platform", "true");
        conf.set("mapreduce.jobhistory.address", "192.168.1.101:10020");

        Job job = Job.getInstance(conf);
        job.setJarByClass(KPIUrlViewerCounter.class);
        // Package the jar first, then run
        job.setJar("E:/KPIUrlViewerCounter.jar");
        job.setJobName("KPIUrlViewerCounter");
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        job.setMapperClass(KPIUrlViewerMapper.class);
        job.setCombinerClass(KPIUrlViewerReducer.class);
        job.setReducerClass(KPIUrlViewerReducer.class);
        FileInputFormat.addInputPath(job, new Path(input));
        FileOutputFormat.setOutputPath(job, new Path(output));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
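Two notes on the driver. First, the reducer can double as the combiner here because per-URL counting is a plain sum, which yields the same total whether or not partial sums are taken on the map side. Second, the hardcoded addresses and the E:/ jar path tie the driver to one specific setup; a more reusable shape is the Tool/ToolRunner pattern, where the cluster settings come from the *-site.xml files or -D options and the paths come from the command line. A sketch of that variant (hypothetical class name, reusing the mapper and reducer above):

package com.hadoop.kpi;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class KPIUrlViewerCounterTool extends Configured implements Tool {

    @Override
    public int run(String[] args) throws Exception {
        Job job = Job.getInstance(getConf(), "KPIUrlViewerCounter");
        job.setJarByClass(KPIUrlViewerCounterTool.class);
        job.setMapperClass(KPIUrlViewerCounter.KPIUrlViewerMapper.class);
        job.setCombinerClass(KPIUrlViewerCounter.KPIUrlViewerReducer.class);
        job.setReducerClass(KPIUrlViewerCounter.KPIUrlViewerReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));   // input dir from the command line
        FileOutputFormat.setOutputPath(job, new Path(args[1])); // output dir from the command line
        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new Configuration(), new KPIUrlViewerCounterTool(), args));
    }
}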
For the prerequisites for running the job and how to view the results, see the previous post.
Run results (you can verify them against the original data):
/about 5
/black-ip-list/ 2
/cassandra-clustor/ 3
/finance-rhive-repurchase/ 13
/hadoop-family-roadmap/ 13
/hadoop-hive-intro/ 14
/hadoop-mahout-roadmap/ 20
/hadoop-zookeeper-intro/ 6
If this post helped you, please remember to leave the author a tip 丷丷.