项目需求
&统计页面的浏览量
&统计各个省份的浏览量
&统计页面的访问量
一.数据处理流程及技术架构
1.统计页面的浏览量
select count(1) from xxx;
count:把每一行都映射成同一个固定的 KEY,value 赋值为 1,最后在 reduce 端累加即可。对应的 MapReduce 程序:PVStatApp
2.统计各个省份的浏览量
select province, count(1) from xxx group by province;
省份(地市)信息可以通过 IP 解析得到 <== 关键问题:如何把 IP 转换成城市信息
3.统计页面的访问量:把符合规则的pageId获取到,然后进行统计即可。
数据准备(部分展示):
20946835322http://www.yihaodian.com/1/?tracker_u=2225501&type=3http://www.baidu.com/s?wd=1%E5%8F%B7%E5%BA%97&rsv_bp=0&ch=&tn=baidu&bar=&rsv_spt=3&ie=utf-8&rsv_sug3=5&rsv_sug=0&rsv_sug1=4&rsv_sug4=313&inputT=42351号店1SKAPHD3JZYH9EE9ACB1NGA9VDQHNJMX1NY9TPPG4SWG71358HGRJGQHQQBXY9GF96CVU2225501\N124.79.172.232msessionid:YR9H5YU7RZ8Y94EBJNZ2P5W8DT37Q9JH,unionKey:22255012013-07-21 09:30:01\Nhttp://www.baidu.com/s?wd=1%E5%8F%B7%E5%BA%97&rsv_bp=0&ch=&tn=baidu&bar=&rsv_spt=3&ie=utf-8&rsv_sug3=5&rsv_sug=0&rsv_sug1=4&rsv_sug4=313&inputT=42351\Nnull-10Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; MATP; Media Center PC 6.0; .NET4.0C; InfoPath.2; .NET4.0E)Win32上海市12013-07-21 09:30:01上海市66\N\N\N\N2013-07-21
20947290187http://www.yihaodian.com/ctg/s2/c5287-%E5%A4%A7%E7%B1%B3/b/a28186,28184-s1-v0-p1-price-d0-f0-m1-rt0-pid-k/1/http://www.yihaodian.com/ctg/s2/c5287-%E5%A4%A7%E7%B1%B3/b1879/a281862SKAPHD3JZYH9EE9ACB1NGA9VDQHNJMX1NY9TPPG4SWG71358HGRJGQHQQBXY9GF96CVU22255011124.79.172.232msessionid:YR9H5YU7RZ8Y94EBJNZ2P5W8DT37Q9JH,uname:chen45311,unionKey:22255012013-07-21 09:32:30134885852http://www.baidu.com/s?wd=1%E5%8F%B7%E5%BA%97&rsv_bp=0&ch=&tn=baidu&bar=&rsv_spt=3&ie=utf-8&rsv_sug3=5&rsv_sug=0&rsv_sug1=4&rsv_sug4=313&inputT=4235110\N1-10Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; MATP; Media Center PC 6.0; .NET4.0C; InfoPath.2; .NET4.0E)Win32 93 1search_order_2上海市1null上海市66\N\N\N\N2013-07-21
20947323490http://www.yihaodian.com/cmsPage/show.do?pageId=49248&provinceId=1http://www.yihaodian.com/cmsPage/show.do?pageId=48963&provinceId=13SKAPHD3JZYH9EE9ACB1NGA9VDQHNJMX1NY9TPPG4SWG71358HGRJGQHQQBXY9GF96CVU2225501\N124.79.172.232msessionid:YR9H5YU7RZ8Y94EBJNZ2P5W8DT37Q9JH,uname:chen45311,unionKey:22255012013-07-21 09:37:04134885852http://www.baidu.com/s?wd=1%E5%8F%B7%E5%BA%97&rsv_bp=0&ch=&tn=baidu&bar=&rsv_spt=3&ie=utf-8&rsv_sug3=5&rsv_sug=0&rsv_sug1=4&rsv_sug4=313&inputT=423520\N1-10Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; MATP; Media Center PC 6.0; .NET4.0C; InfoPath.2; .NET4.0E)Win32cms_pic_48963_542760_0上海市1上海市66\N\N\N\N2013-07-21
1.统计页面的浏览量
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
/**
* 浏览量的统计
*/
/**
 * MapReduce job that computes the page-view (PV) count, i.e. the total number of input records.
 *
 * <p>Equivalent to {@code SELECT COUNT(1) FROM xxx}: every input line is mapped to the same
 * constant key with the value 1, so all records land in a single reduce group whose values are
 * added up.
 *
 * <p>Usage: {@code PVStatApp <input path> <output path>}. An existing output path is deleted
 * recursively before the job is submitted.
 */
public class PVStatApp {

    /**
     * Configures and runs the job.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} is the output path
     * @throws IllegalArgumentException if fewer than two arguments are given
     * @throws Exception if the job cannot be configured or submitted
     */
    public static void main(String[] args) throws Exception {
        if (args == null || args.length < 2) {
            throw new IllegalArgumentException("Usage: PVStatApp <input path> <output path>");
        }
        Path inputPath = new Path(args[0]);
        Path outputPath = new Path(args[1]);

        Configuration configuration = new Configuration();

        // Resolve the file system from the output path itself so that a fully qualified URI
        // (different from fs.defaultFS) is cleaned up on the file system that actually owns it.
        // The returned FileSystem may be a shared cached instance, so it is deliberately not closed.
        FileSystem fileSystem = outputPath.getFileSystem(configuration);
        if (fileSystem.exists(outputPath)) {
            fileSystem.delete(outputPath, true);
        }

        Job job = Job.getInstance(configuration);
        job.setJarByClass(PVStatApp.class);

        job.setMapperClass(Mymapper.class);
        job.setReducerClass(MyReducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);

        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(LongWritable.class);

        FileInputFormat.setInputPaths(job, inputPath);
        FileOutputFormat.setOutputPath(job, outputPath);

        // Surface a failed job through the process exit status instead of silently returning.
        if (!job.waitForCompletion(true)) {
            System.exit(1);
        }
    }

    /** Emits the pair ("key", 1) for every input record, regardless of its content. */
    static class Mymapper extends Mapper<LongWritable, Text, Text, LongWritable> {

        // Never mutated, so the same instances can be written for every record.
        private static final Text KEY = new Text("key");
        private static final LongWritable ONE = new LongWritable(1);

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            context.write(KEY, ONE);
        }
    }

    /** Adds up all values of a key and writes the total with a {@link NullWritable} key. */
    static class MyReducer extends Reducer<Text, LongWritable, NullWritable, LongWritable> {

        @Override
        protected void reduce(Text key, Iterable<LongWritable> values, Context context)
                throws IOException, InterruptedException {
            // Sum the values rather than counting the elements: the result is identical while
            // every value is 1, and stays correct if values are ever pre-aggregated upstream.
            long count = 0;
            for (LongWritable value : values) {
                count += value.get();
            }
            context.write(NullWritable.get(), new LongWritable(count));
        }
    }
}
2.统计各个省份的浏览量
select province, count(1) from xxx group by province;
省份(地市)信息可以通过 IP 解析得到 <== 关键问题:如何把 IP 转换成城市信息
省份可以由 IP 解析得到,因此需要一个 IP 解析类。IP 解析类及其测试如下:
IP解析类:
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.io.UnsupportedEncodingException;
import java.nio.ByteOrder;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
import java.util.ArrayList;
import java.util.Hashtable;
import java.util.List;
public class IPSeeker {
// Result string for a bad IP database file (the literal reads "invalid IP database file").
// NOTE(review): where this is returned is not visible in this part of the file.
public static final String ERROR_RESULT = "错误的IP数据库文件";
// Fixed constants, e.g. the record length.
// IP_RECORD_LENGTH is the step used when walking consecutive index records.
private static final int IP_RECORD_LENGTH = 7;
// NOTE(review): presumably record-layout flag bytes of the database format; their use is not
// visible in this part of the file - confirm against the read methods.
private static final byte AREA_FOLLOWED = 0x01;
private static final byte NO_AREA = 0x2;
// Used as a cache: a lookup checks the cache first to avoid repeating the same search.
private Hashtable ipCache;
// Random-access handle on the IP database file; null when the file could not be opened or read.
private RandomAccessFile ipFile;
// Memory-mapped view of the file.
private MappedByteBuffer mbb;
// The singleton instance.
private static IPSeeker instance = null;
// Absolute offsets of the beginning and the end of the index area (read from the file header).
private long ipBegin, ipEnd;
// Scratch objects kept as fields for efficiency, so they are not re-allocated on every call.
private IPLocation loc;
private byte[] buf;
private byte[] b4;
private byte[] b3;
/**
 * Opens the IP database file and reads the two header values into {@code ipBegin} and
 * {@code ipEnd}.
 *
 * <p>The constructor never throws for a missing or unreadable file. Instead it prints a message
 * to standard output and leaves {@code ipFile} set to {@code null}, which marks the seeker as
 * unusable. A header value of {@code -1} from {@code readLong4} is treated as a read failure.
 *
 * <p>Declared {@code protected} (not private); instances are normally obtained through
 * {@code getInstance}.
 *
 * @param ipFilePath path of the IP database file to open read-only
 */
protected IPSeeker(String ipFilePath) {
    ipCache = new Hashtable();
    loc = new IPLocation();
    buf = new byte[100];
    b4 = new byte[4];
    b3 = new byte[3];
    try {
        ipFile = new RandomAccessFile(ipFilePath, "r");
    } catch (FileNotFoundException e) {
        System.out.println("IP地址信息文件没有找到,IP显示功能将无法使用");
        ipFile = null;
    }
    // If the file was opened successfully, read the header information.
    if (ipFile != null) {
        try {
            ipBegin = readLong4(0);
            ipEnd = readLong4(4);
            if (ipBegin == -1 || ipEnd == -1) {
                ipFile.close();
                ipFile = null;
            }
        } catch (IOException e) {
            System.out.println("IP地址信息文件格式有错误,IP显示功能将无法使用");
            // Release the handle before dropping the reference; otherwise the open file
            // descriptor leaks until the object is garbage collected.
            try {
                ipFile.close();
            } catch (IOException ignored) {
                // Best effort: the seeker is already being marked unusable.
            }
            ipFile = null;
        }
    }
}
/**
 * Returns the shared {@code IPSeeker}, creating it on the first call.
 *
 * <p>Only the first call uses {@code ipFilePath}; once an instance exists it is returned as-is
 * and the argument is not consulted. The method performs no synchronization.
 *
 * @param ipFilePath path of the IP database file, used when the instance is first created
 * @return the single shared instance
 */
public static IPSeeker getInstance(String ipFilePath) {
    if (instance != null) {
        return instance;
    }
    instance = new IPSeeker(ipFilePath);
    return instance;
}
/** */
/**
* 给定一个地点的不完全名字,得到一系列包含s子串的IP范围记录
*
* @param s
* 地点子串
* @return 包含IPEntry类型的List
*/
public List getIPEntriesDebug(String s) {
List ret = new ArrayList();
long endOffset = ipEnd + 4;
for (long offset = ipBegin + 4; offset <= endOffset; offset += IP_RECORD_LENGTH) {
// 读取结束IP偏移
long temp = readLong3(offset);
// 如果temp不等于-1,读取IP的地点信息
if (temp != -1) {
IPLocation loc = getIPLocation(temp);
// 判断是否这个地点里面包含了s子串,如果包含了,添加这个记录到List中,如果没有,继续
if (loc.country.indexOf(s) != -1 || loc.area.indexOf(s) != -1) {
IPEntry entry = new IPEntry();
entry.country = loc.country;
entry.area = loc.area;
// 得到起始IP
readIP(offset - 4, b4);
entry.beginIp = IPSeekerUtils.getIpStringFromBytes(b4);
// 得到结束IP
readIP(temp, b4);
entry.endIp = IPSeekerUtils.getIpStringFromBytes(b4);
// 添加该记录
ret.add(entry