电商日志分析2
提供了四个工具类
1.GetPageId
2.IPParser
3.IPSeeker
4.LogParser
GetPageId代码
package com.task.ds.utils;
import org.apache.commons.lang.StringUtils;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Utility for extracting the topic (page) id from a request URL.
 */
public class GetPageId {
    /**
     * Matches a {@code topicId=<digits>} parameter; group 1 holds the digits.
     * Compiled once because {@link Pattern} instances are immutable and reusable.
     */
    private static final Pattern TOPIC_ID_PATTERN = Pattern.compile("topicId=([0-9]+)");

    /**
     * Returns the digits that follow the first {@code topicId=} occurrence in the URL.
     *
     * @param url the URL to inspect; may be {@code null}, empty or blank
     * @return the topic id as a string, or {@code ""} when the URL is {@code null}
     *         or contains no {@code topicId=<digits>} parameter
     */
    public static String getPageId(String url) {
        // Only null needs an explicit guard: an empty or whitespace-only URL
        // can never match the pattern and therefore also yields "".
        if (url == null) {
            return "";
        }
        Matcher matcher = TOPIC_ID_PATTERN.matcher(url);
        return matcher.find() ? matcher.group(1) : "";
    }
}
ProvinceStatApp代码(调用IPParser)
package com.task.ds.pro;
import com.task.ds.utils.IPParser;
import com.task.ds.utils.LogParser;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
import java.util.Map;
/**
* 省份浏览量统计
*/
public class ProvinceStatApp {
public static void main(String[] args) throws Exception {
Configuration configuration = new Configuration();
FileSystem fileSystem = FileSystem.get(configuration);
Path outputPath = new Path("E:\\IdeaProject\\hadoop\\project\\src\\main\\java\\com\\data\\trackInfo_out");
if (fileSystem.exists(outputPath)) {
fileSystem.delete(outputPath, true);
}
Job job = Job.getInstance(configuration);
job.setJarByClass(ProvinceStatApp.class);
job.setMapperClass(MyMapper.class);
job.setReducerClass(MyReducer.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(LongWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(LongWritable.class);
FileInputFormat.setInputPaths(job, new Path("E:\\IdeaProject\\hadoop\\project\\src\\main\\java\\com\\data\\trackinfo_20130721.txt"));
FileOutputFormat.setOutputPath(job, new Path("E:\\IdeaProject\\hadoop\\project\\src\\main\\java\\com\\data\\trackInfo_out"));
job.waitForCompletion(true);
}
static class MyMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
private LongWritable ONE = new LongWritable(1);
private LogParser logParser;
@Override
protected void setup(Context context) throws IOException, InterruptedException {
logParser = new LogParser();
}
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String log = value.toString();
Map<String, String> info = logParser.parse(log);
String ip = info.get("ip");
if (StringUtils.isNotBlank(ip)) {
IPParser.RegionInfo regionInfo = IPParser.getInstance().analyseIp(ip);
if (regionInfo != null) {
String provine = regionInfo.getProvince();
if (StringUtils.isNotBlank(provine)) {
context.write(new Text(provine), ONE);
} else {
context.write(new Text("-"), ONE);
}
} else {
context.write(new Text("-"), ONE);
}
} else {
context.write(new Text("-"), ONE);
}
}
}
static class MyReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
@Override
protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
long count = 0;
System.out.println(context);
for (LongWritable value : values) {
count++;
}
context.write(key, new LongWritable(count));
}
}
}
IPSeeker部分代码
package com.task.ds.utils;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.io.UnsupportedEncodingException;
import java.nio.ByteOrder;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
import java.util.ArrayList;
import java.util.Hashtable;
import java.util.List;
/**
 * Reads location entries from an IP database file, either through a
 * {@link RandomAccessFile} or through a memory-mapped view of the same file.
 * Only part of the class is shown here; helpers such as the record readers
 * are defined outside the visible excerpt.
 */
public class IPSeeker {
// Message text for an invalid IP database file (not referenced in the visible part of the class).
public static final String ERROR_RESULT = "错误的IP数据库文件";
// Fixed constants such as the record length.
// IP_RECORD_LENGTH is the step, in bytes, between consecutive index entries in the scan loops.
private static final int IP_RECORD_LENGTH = 7;
// NOTE(review): presumably record-layout markers; their use is outside the visible code — confirm.
private static final byte AREA_FOLLOWED = 0x01;
private static final byte NO_AREA = 0x2;
// Cache that is consulted first when an IP is looked up, to avoid repeated searches.
private Hashtable ipCache;
// Random-access handle on the database file; the constructor sets it to null when the file is unusable.
private RandomAccessFile ipFile;
// Memory-mapped view of the database file.
private MappedByteBuffer mbb;
// Singleton instance.
private static IPSeeker instance = null;
// Absolute offsets of the start and the end of the index area.
private long ipBegin, ipEnd;
// Temporary objects kept as fields for efficiency.
private IPLocation loc;
private byte[] buf;
private byte[] b4;
private byte[] b3;
/** */
/**
* 私有构造函数
*/
protected IPSeeker(String ipFilePath) {
ipCache = new Hashtable();
loc = new IPLocation();
buf = new byte[100];
b4 = new byte[4];
b3 = new byte[3];
try {
ipFile = new RandomAccessFile(ipFilePath, "r");
} catch (FileNotFoundException e) {
System.out.println("IP地址信息文件没有找到,IP显示功能将无法使用");
ipFile = null;
}
// 如果打开文件成功,读取文件头信息
if (ipFile != null) {
try {
ipBegin = readLong4(0);
ipEnd = readLong4(4);
if (ipBegin == -1 || ipEnd == -1) {
ipFile.close();
ipFile = null;
}
} catch (IOException e) {
System.out.println("IP地址信息文件格式有错误,IP显示功能将无法使用");
ipFile = null;
}
}
}
/**
 * Returns the shared instance, creating it on the first call.
 *
 * @param ipFilePath path of the IP database file; only used when the instance
 *                   does not exist yet
 * @return the single shared instance
 */
public static IPSeeker getInstance(String ipFilePath) {
    if (instance != null) {
        return instance;
    }
    instance = new IPSeeker(ipFilePath);
    return instance;
}
/**
 * Given a partial place name, collects the IP range records whose country or
 * area contains it. This variant reads through the file-based helpers.
 *
 * @param s substring of a place name
 * @return a List of IPEntry objects
 */
public List getIPEntriesDebug(String s) {
    List matches = new ArrayList();
    final long lastOffset = ipEnd + 4;
    for (long offset = ipBegin + 4; offset <= lastOffset; offset += IP_RECORD_LENGTH) {
        // Offset of the end-IP record for this index entry.
        long recordOffset = readLong3(offset);
        if (recordOffset == -1) {
            continue;
        }
        // Read the location stored for this record.
        IPLocation location = getIPLocation(recordOffset);
        // Skip entries whose country and area both lack the substring.
        if (location.country.indexOf(s) == -1 && location.area.indexOf(s) == -1) {
            continue;
        }
        IPEntry entry = new IPEntry();
        entry.country = location.country;
        entry.area = location.area;
        // Start IP: the four bytes preceding the record offset field.
        readIP(offset - 4, b4);
        entry.beginIp = IPSeekerUtils.getIpStringFromBytes(b4);
        // End IP: stored at the record offset.
        readIP(recordOffset, b4);
        entry.endIp = IPSeekerUtils.getIpStringFromBytes(b4);
        matches.add(entry);
    }
    return matches;
}
/**
 * Given a partial place name, collects the IP range records whose country or
 * area contains it. This variant scans a memory-mapped view of the database
 * file, which is created on first use.
 *
 * @param s substring of a place name
 * @return a List of IPEntry objects; empty when nothing matches, when the
 *         database file is unavailable, or when mapping the file fails
 */
public List getIPEntries(String s) {
    List ret = new ArrayList();
    // The constructor leaves ipFile null when the database is missing or invalid.
    // Without a file there is nothing to map, so return no entries instead of
    // failing with a NullPointerException.
    if (mbb == null && ipFile == null) {
        return ret;
    }
    try {
        // Map the IP database file into memory on first use.
        if (mbb == null) {
            FileChannel fc = ipFile.getChannel();
            mbb = fc.map(FileChannel.MapMode.READ_ONLY, 0, ipFile.length());
            mbb.order(ByteOrder.LITTLE_ENDIAN);
        }
        // NOTE(review): getIPEntriesDebug scans through ipEnd + 4, this loop stops
        // at ipEnd — confirm whether leaving out the final index entry is intended.
        int endOffset = (int) ipEnd;
        for (int offset = (int) ipBegin + 4; offset <= endOffset; offset += IP_RECORD_LENGTH) {
            int temp = readInt3(offset);
            if (temp != -1) {
                IPLocation loc = getIPLocation(temp);
                // Keep the entry only when its country or area contains the substring.
                if (loc.country.indexOf(s) != -1 || loc.area.indexOf(s) != -1) {
                    IPEntry entry = new IPEntry();
                    entry.country = loc.country;
                    entry.area = loc.area;
                    // Start IP: the four bytes preceding the record offset field.
                    readIP(offset - 4, b4);
                    entry.beginIp = IPSeekerUtils.getIpStringFromBytes(b4);
                    // End IP: stored at the record offset.
                    readIP(temp, b4);
                    entry.endIp = IPSeekerUtils.getIpStringFromBytes(b4);
                    ret.add(entry);
                }
            }
        }
    } catch (IOException e) {
        System.out.println(e.getMessage());
    }
    return ret;
}
/**
 * Positions the memory-mapped buffer at {@code offset}, reads an int there and
 * keeps its low 24 bits.
 *
 * @param offset position in the memory-mapped file to read from
 * @return the int read at {@code offset}, masked to 24 bits
 */
private int readInt3(int offset) {
    mbb.position(offset);
    // getInt() consumes four bytes; the mask discards the top eight bits.
    final int word = mbb.getInt();
    return word & 0x00FFFFFF;
}
/**
 * Reads an int at the buffer's current position and keeps its low 24 bits.
 *
 * @return the int read at the current position, masked to 24 bits
 */
private int readInt3() {
    // getInt() consumes four bytes; the mask discards the top eight bits.
    final int word = mbb.getInt();
    return word & 0x00FFFFFF;
}
LogParser代码
package com.task.ds.utils;
import org.apache.commons.lang.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.HashMap;
import java.util.Map;
/**
 * Parses one raw log line into a map of named fields.
 *
 * <p>Fields are separated by the {@code \001} character. The parser extracts
 * the url, session id, ip and time columns and adds the country, province and
 * city that {@link IPParser} resolves for the ip.
 */
public class LogParser {
    private static final Logger logger = LoggerFactory.getLogger(LogParser.class);

    /** Column separator of the raw log line. */
    private static final String FIELD_SEPARATOR = "\001";
    private static final int URL_INDEX = 1;
    private static final int SESSION_ID_INDEX = 10;
    private static final int IP_INDEX = 13;
    private static final int TIME_INDEX = 17;
    /** Smallest number of columns a line needs so that every index above exists. */
    private static final int MIN_FIELD_COUNT = TIME_INDEX + 1;

    /**
     * Parses a log line.
     *
     * @param log the raw log line; may be {@code null} or blank
     * @return a map with the keys {@code ip}, {@code url}, {@code sessionId} and
     *         {@code time}, plus {@code country}, {@code province} and {@code city}
     *         when the ip could be resolved; an empty map when the line is blank
     *         or has too few columns
     */
    public Map<String, String> parse(String log) {
        Map<String, String> logInfo = new HashMap<String, String>();
        if (StringUtils.isBlank(log)) {
            logger.error("日志记录的格式不正确:" + log);
            return logInfo;
        }

        String[] splits = log.split(FIELD_SEPARATOR);
        // A truncated line would otherwise fail with ArrayIndexOutOfBoundsException;
        // report it the same way as a blank line.
        if (splits.length < MIN_FIELD_COUNT) {
            logger.error("日志记录的格式不正确:" + log);
            return logInfo;
        }

        String ip = splits[IP_INDEX];
        logInfo.put("ip", ip);
        logInfo.put("url", splits[URL_INDEX]);
        logInfo.put("sessionId", splits[SESSION_ID_INDEX]);
        logInfo.put("time", splits[TIME_INDEX]);

        // Callers of analyseIp elsewhere treat a null result as possible, so guard
        // before reading the region fields.
        IPParser.RegionInfo regionInfo = IPParser.getInstance().analyseIp(ip);
        if (regionInfo != null) {
            logInfo.put("country", regionInfo.getCountry());
            logInfo.put("province", regionInfo.getProvince());
            logInfo.put("city", regionInfo.getCity());
        }
        return logInfo;
    }
}