需求:
日志的ETL操作
(ETL:数据从来源端经过抽取(Extract)、转换(Transform)、加载(Load)至目的端的过程)
思路:
只需要解析出:ip、url、pageId(topicId对应的页面Id)、country、province、city
开发步骤:
Map:
/**
 * Parses one raw log line and emits a summary of its fields as the output key with a count of one.
 *
 * <p>The key has the form
 * {@code "IP: ..., URL: ..., PageId: ..., Country: ..., Province: ..., City: ..."};
 * any field that is null or empty is left out. A line that cannot be parsed, or for which
 * no field carries a value, is logged as an error and skipped.
 *
 * @param key     not used by this method
 * @param value   the raw log line to parse
 * @param context receives the (summary, one) pair
 * @throws IOException          propagated from {@code context.write}
 * @throws InterruptedException propagated from {@code context.write}
 */
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    Map<String, String> logInfo = logParser.parse(value.toString());
    if (logInfo == null) {
        logger.error("日志记录的格式不正确或解析失败:" + value.toString());
        return;
    }

    // Derive the page id from the URL and store it alongside the parsed fields.
    // NOTE(review): url may be null here; confirm GetPageId.getPageId tolerates a null argument.
    String url = logInfo.get("url");
    String topicId = GetPageId.getPageId(url);
    logInfo.put("pageId", topicId);

    StringBuilder summary = new StringBuilder();
    appendField(summary, "IP", logInfo.get("ip"));
    appendField(summary, "URL", url);
    appendField(summary, "PageId", topicId);
    appendField(summary, "Country", logInfo.get("country"));
    appendField(summary, "Province", logInfo.get("province"));
    appendField(summary, "City", logInfo.get("city"));

    // Decide on the assembled text rather than on null checks alone: a record whose
    // fields are all empty strings must be rejected too, not emitted with an empty key.
    if (summary.length() == 0) {
        logger.error("所有字段为空,日志记录:" + value.toString());
        return;
    }

    outputKey.set(summary.toString());
    context.write(outputKey, one);
}

/** Appends {@code "label: value"} to {@code sb}, preceded by {@code ", "} when {@code sb} already has content; null or empty values are skipped. */
private void appendField(StringBuilder sb, String label, String value) {
    if (value == null || value.isEmpty()) {
        return;
    }
    if (sb.length() > 0) {
        sb.append(", ");
    }
    sb.append(label).append(": ").append(value);
}
Reduce:
/**
 * Adds up all counts received for one key and writes the total under that same key.
 *
 * @param key     the key whose counts are being combined
 * @param values  the individual counts for {@code key}
 * @param context receives the (key, total) pair
 * @throws IOException          propagated from {@code context.write}
 * @throws InterruptedException propagated from {@code context.write}
 */
protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
    int total = 0;
    for (IntWritable count : values) {
        total = total + count.get();
    }

    // Write the total through the shared result holder.
    result.set(total);
    context.write(key, result);
}