软件环境:
storm1.1.0
使用一个600多MB的网站日志文件来模拟网站每天产生的日志信息,日志内容形如:
120.197.87.216 - - [04/Jan/2012:00:00:02 +0800] "GET /home.php?mod=space&uid=563413&mobile=yes HTTP/1.1" 200 3388 "-" "-"
123.126.50.73 - - [04/Jan/2012:00:00:02 +0800] "GET /thread-679411-1-1.html HTTP/1.1" 200 5251 "-" "Sogou web spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)"
203.208.60.187 - - [04/Jan/2012:00:00:02 +0800] "GET /archiver/tid-3003.html HTTP/1.1" 200 2056 "-" "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
114.112.141.6 - - [04/Jan/2012:00:00:02 +0800] "GET /ctp080113.php?action=getgold HTTP/1.1" 200 13886 "-" "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; InfoPath.3; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)"
114.112.141.6 - - [04/Jan/2012:00:00:02 +0800] "GET /ctp080113.php?action=getmedal HTTP/1.1" 200 13882 "-" "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; InfoPath.3; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)"
110.6.179.88 - - [04/Jan/2012:00:00:02 +0800] "GET /forum.php?mod=attachment&aid=NTczNzU3fDFjNDdjZTgzfDEzMjI4NzgwMDV8MTMzOTc4MDB8MTEwMTcxMA%3D%3D&mobile=no HTTP/1.1" 200 172 "http://www.itpub.net/forum.php?mod=attachment&aid=NTczNzU3fDFjNDdjZTgzfDEzMjI4NzgwMDV8MTMzOTc4MDB8MTEwMTcxMA%3D%3D&mobile=yes" "Mozilla/5.0 (Linux; U; Android 2.2; zh-cn; ZTE-U V880 Build/FRF91) UC AppleWebKit/530+ (KHTML, like Gecko) Mobile Safari/530"
116.205.130.2 - - [04/Jan/2012:00:00:02 +0800] "GET /popwin_js.php?fid=6 HTTP/1.1" 200 32 "http://www.itpub.net/forum-6-1.html?ts=28" "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; QQDownload 702; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; AskTbPTV/5.11.3.15590; .NET4.0E)"
114.112.141.6 - - [04/Jan/2012:00:00:02 +0800] "GET /popwin_js.php?fid=133 HTTP/1.1" 200 11 "http://www.itpub.net/thread-1558574-3-9.html" "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; InfoPath.3; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)"
114.112.141.6 - - [04/Jan/2012:00:00:02 +0800] "GET /ctp080113.php?tid=1558574 HTTP/1.1" 200 5 "http://www.itpub.net/thread-1558574-3-9.html" "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; InfoPath.3; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)"
110.75.173.35 - - [04/Jan/2012:00:00:02 +0800] "GET /forum.php?goto=lastpost&mod=redirect&tid=1380214 HTTP/1.1" 302 5 "-" "Yahoo! Slurp China"
以上是日志文件的样本。将该日志文件上传到HDFS上的log目录下。
使用开源库https://github.com/Gpwner/user-agent-utils解析出浏览器的名称
使用的时候,简单地调用一下API就能解析出浏览器的名称了
1.思路
首先通过HdfsSpout从HDFS上读取日志文件:
// Build the spout that reads the log files from HDFS.
// All four locations are taken from the command-line arguments.
HdfsSpout hdfsSpout = new HdfsSpout()
// Read the files with the "text" reader and use that reader's default output fields.
.setReaderType("text")
.withOutputFields(TextFileReader.defaultFields)
// args[0]: URI of the HDFS instance to read from.
.setHdfsUri(args[0])
// args[1]: directory that holds the log files to be consumed.
.setSourceDir(args[1])
// args[2]: archive directory.
// NOTE(review): presumably files are moved here once fully read - confirm against the storm-hdfs docs.
.setArchiveDir(args[2])
// args[3]: directory for bad files.
// NOTE(review): presumably files that cannot be read are moved here - confirm.
.setBadFilesDir(args[3]);
然后数据流入BrowserExtractBolt,由它从每一行日志中解析出浏览器的名称:
/**
 * Extracts the browser name from one log line and emits it as a single-value tuple.
 *
 * <p>The log line is read from the tuple field named by {@code field}. A missing
 * (null) line is reported on stderr and nothing is emitted. The input tuple is
 * acknowledged in both cases.
 *
 * @param input tuple carrying the raw log line
 */
public void execute(Tuple input) {
    String log = input.getStringByField(field);
    if (log == null) {
        System.err.println("解析出现异常:" + log);
    } else {
        // Parse only after the null check; the original parsed first, so the
        // check could never protect the parser from a null line.
        UserAgent agent = UserAgent.parseUserAgentString(log);
        collector.emit(new Values(agent.getBrowser().getName()));
    }
    // Ack unconditionally so the tuple is never left pending.
    collector.ack(input);
}
接下来的处理与词频统计类似,由CountBolt对每种浏览器出现的次数进行累加:
/**
 * Increments the running total for the browser named in the tuple, prints the
 * new total, and emits the (browser, count) pair.
 *
 * @param input     tuple with a "browser" field
 * @param collector collector used to emit the updated count
 */
public void execute(Tuple input, BasicOutputCollector collector) {
    String browser = input.getStringByField("browser");
    Integer previous = counts.get(browser);
    // A browser seen for the first time starts at 1.
    Integer updated = (previous == null) ? 1 : previous + 1;
    counts.put(browser, updated);
    System.out.println(browser + " : " + updated);
    collector.emit(new Values(browser, updated));
}
不清楚如何将hdfs与storm整合的看我之前的博客:
http://blog.csdn.net/gpwner/article/details/74157575
整个拓扑的构建:
2.实现
使用到的依赖
<!-- Maven dependencies required by the browser-count topology. -->
<dependencies>
<!-- Storm core library, version 1.1.0. -->
<dependency>
<groupId>org.apache.storm</groupId>
<artifactId>storm-core</artifactId>
<!-- The "provided" scope is left commented out. NOTE(review): presumably so that storm-core stays on the classpath for a local run; confirm before packaging for a real cluster. -->
<!--<scope>provided</scope>-->
<version>1.1.0</version>
</dependency>
<!-- Storm HDFS integration (org.apache.storm.hdfs.*), which supplies HdfsSpout and TextFileReader. -->
<dependency>
<groupId>org.apache.storm</groupId>
<artifactId>storm-hdfs</artifactId>
<version>1.1.0</version>
</dependency>
<!-- User-agent parsing library (eu.bitwalker.useragentutils.UserAgent). -->
<dependency>
<groupId>eu.bitwalker</groupId>
<artifactId>UserAgentUtils</artifactId>
<version>1.20</version>
</dependency>
</dependencies>
BrowserCountTopology
import neu.bolt.BrowserExtractBolt;
import neu.bolt.CountBolt;
import neu.bolt.ExtractBolt;
import org.apache.storm.Config;
import org.apache.storm.LocalCluster;
import org.apache.storm.generated.AlreadyAliveException;
import org.apache.storm.generated.AuthorizationException;
import org.apache.storm.generated.InvalidTopologyException;
import org.apache.storm.hdfs.spout.HdfsSpout;
import org.apache.storm.hdfs.spout.TextFileReader;
import org.apache.storm.topology.TopologyBuilder;
import org.apache.storm.tuple.Fields;
import java.util.HashMap;
/**
 * Wires an {@link HdfsSpout}, a {@link BrowserExtractBolt} and a {@link CountBolt}
 * into a topology and runs it for 90 seconds on an in-process {@link LocalCluster}.
 */
public class BrowserCountTopology {

    /**
     * Builds the topology, submits it to a local cluster, waits 90 seconds and
     * shuts the cluster down.
     *
     * @param args exactly four values: HdfsUri, SourceDir, ArchiveDir, BadFilesDir
     * @throws InvalidTopologyException declared for topology submission
     * @throws AuthorizationException   declared for topology submission
     * @throws AlreadyAliveException    declared for topology submission
     * @throws InterruptedException     if the 90-second wait is interrupted
     */
    public static void main(String[] args) throws InvalidTopologyException, AuthorizationException, AlreadyAliveException, InterruptedException {
        System.setProperty("HADOOP_USER_NAME", "root");
        if (args.length != 4) {
            System.out.println("Usage <HdfsUri SourceDir ArchiveDir BadFilesDir>");
            System.exit(1);
        }

        TopologyBuilder builder = new TopologyBuilder();

        HdfsSpout hdfsSpout = new HdfsSpout()
                .setReaderType("text")
                .withOutputFields(TextFileReader.defaultFields)
                .setHdfsUri(args[0])
                .setSourceDir(args[1])
                .setArchiveDir(args[2])
                .setBadFilesDir(args[3]);

        // Tell BrowserExtractBolt which tuple field holds the log line. The key must be
        // the constant that BrowserExtractBolt.prepare() reads, so use that bolt's own
        // FIELD rather than the constant of an unrelated bolt class.
        // NOTE(review): "line" is presumably the field name in TextFileReader.defaultFields - confirm.
        HashMap<String, Object> boltConf = new HashMap<>();
        boltConf.put(BrowserExtractBolt.FIELD, "line");

        builder.setSpout("hdfsSpout", hdfsSpout, 1);
        builder.setBolt("browserextractbolt", new BrowserExtractBolt(), 8)
                .addConfigurations(boltConf)
                .shuffleGrouping("hdfsSpout");
        // Group by browser so every tuple for a given browser reaches the same counter task.
        builder.setBolt("countBolt", new CountBolt(), 1)
                .fieldsGrouping("browserextractbolt", new Fields("browser"));

        Config conf = new Config();
        conf.setDebug(true);
        // NOTE(review): this presumably caps every component at one executor, which would
        // override the parallelism hint of 8 given above - confirm the intent.
        conf.setMaxTaskParallelism(1);

        LocalCluster cluster = new LocalCluster();
        try {
            cluster.submitTopology("CountTopology", conf, builder.createTopology());
            Thread.sleep(90000);
        } finally {
            // Shut down even if submission fails or the wait is interrupted.
            cluster.shutdown();
        }
    }
}
BrowserExtractBolt
import eu.bitwalker.useragentutils.UserAgent;
import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.IRichBolt;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Tuple;
import org.apache.storm.tuple.Values;
import java.util.Map;
/**
 * Bolt that reads a raw log line from a configurable tuple field, parses it with
 * {@link UserAgent} and emits the browser name in a single output field, "browser".
 */
public class BrowserExtractBolt implements IRichBolt {

    /** Configuration key whose value is the name of the tuple field holding the log line. */
    public static final String FIELD = "field";

    /** Name of the input tuple field to read; set in {@link #prepare}. */
    private String field;

    /** Collector used for emitting and acking; set in {@link #prepare}. */
    private OutputCollector collector;

    /**
     * Stores the collector and looks up the input field name under {@link #FIELD}.
     *
     * @param stormConf configuration map; must contain a value for {@link #FIELD}
     * @param context   topology context (unused)
     * @param collector collector used to emit and ack tuples
     * @throws IllegalStateException if no field name is configured under {@link #FIELD}
     */
    public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) {
        this.collector = collector;
        this.field = (String) stormConf.get(FIELD);
        if (this.field == null) {
            // Fail here with a clear message instead of failing on the first tuple
            // when a null field name is looked up.
            throw new IllegalStateException(
                    "Missing bolt configuration '" + FIELD + "': name of the tuple field holding the log line");
        }
    }

    /**
     * Extracts the browser name from one log line and emits it.
     *
     * <p>A missing (null) line is reported on stderr and nothing is emitted.
     * The input tuple is acknowledged in both cases.
     *
     * @param input tuple carrying the raw log line in the configured field
     */
    public void execute(Tuple input) {
        String log = input.getStringByField(field);
        if (log == null) {
            System.err.println("解析出现异常:" + log);
        } else {
            // Parse only after the null check; the original parsed first, so the
            // check could never protect the parser from a null line.
            UserAgent agent = UserAgent.parseUserAgentString(log);
            collector.emit(new Values(agent.getBrowser().getName()));
        }
        // Ack unconditionally so the tuple is never left pending.
        collector.ack(input);
    }

    /** No resources to release. */
    public void cleanup() {
    }

    /**
     * Declares the single output field, "browser".
     *
     * @param declarer declarer that receives the output schema
     */
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        declarer.declare(new Fields("browser"));
    }

    /**
     * Returns no component-specific configuration.
     *
     * @return always {@code null}
     */
    public Map<String, Object> getComponentConfiguration() {
        return null;
    }
}
CountBolt
/**
 * Bolt that keeps an in-memory running count per browser name and emits the
 * updated (browser, count) pair for every incoming tuple.
 */
public class CountBolt extends BaseBasicBolt {

    /** Running totals, keyed by browser name. */
    private Map<String, Integer> counts = new HashMap<>();

    /**
     * Increments the total for the browser named in the tuple, prints the new
     * total, and emits it.
     *
     * @param input     tuple with a "browser" field
     * @param collector collector used to emit the updated count
     */
    public void execute(Tuple input, BasicOutputCollector collector) {
        String browser = input.getStringByField("browser");
        Integer previous = counts.get(browser);
        // A browser seen for the first time starts at 1.
        Integer updated = (previous == null) ? 1 : previous + 1;
        counts.put(browser, updated);
        System.out.println(browser + " : " + updated);
        collector.emit(new Values(browser, updated));
    }

    /**
     * Declares the two output fields, "browser" and "count".
     *
     * @param declarer declarer that receives the output schema
     */
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        Fields schema = new Fields("browser", "count");
        declarer.declare(schema);
    }
}
参数配置:
运行结果: