Counting Page Views per Province (Requires IP Parsing)
Steps
1. Create the classes
The job consists of three classes: ProvinceMapper parses each log line, resolves the client IP to a province, and emits (province, 1); ProvinceReducer sums the counts per province; ProvinceDriver configures and submits the job. The code is as follows (example):
ProvinceMapper
import com.bigdata.hadoop.project.utils.IPParser;
import com.bigdata.hadoop.project.utils.LogParser;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;
import java.util.Map;

public class ProvinceMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    private static final IntWritable ONE = new IntWritable(1);
    // Reused output key, so map() does not allocate a new Text per record
    private final Text province = new Text();
    // One parser instance shared across all map() calls
    private final LogParser parser = new LogParser();

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // LogParser extracts named fields (including "ip") from the raw log line
        Map<String, String> logInfo = parser.parse(value.toString());
        String ip = logInfo.get("ip");

        if (StringUtils.isNotBlank(ip)) {
            // Resolve the IP to a region; the province may still be unknown
            IPParser.RegionInfo regionInfo = IPParser.getInstance().analyseIp(ip);
            String name = (regionInfo == null) ? null : regionInfo.getProvince();
            // "-" marks records whose IP could not be mapped to a province
            province.set(StringUtils.isNotBlank(name) ? name : "-");
        } else {
            // "+" marks records with no usable IP field at all
            province.set("+");
        }
        context.write(province, ONE);
    }
}
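The mapper depends on two project utilities, LogParser and IPParser (from com.bigdata.hadoop.project.utils), whose source is not shown here. Judging only from the call sites above, their shape is roughly as follows; this is a hypothetical sketch, and the real classes (in particular how analyseIp resolves an IP to a region) may differ:

import java.util.HashMap;
import java.util.Map;

// Hypothetical stub: parses one "\u0001"-delimited log line into named fields.
// The post assumes the IP address sits in the 14th field (index 13).
class LogParser {
    public Map<String, String> parse(String log) {
        Map<String, String> info = new HashMap<>();
        String[] fields = log.split("\u0001");
        if (fields.length > 13) {
            info.put("ip", fields[13]);
        }
        return info;
    }
}

// Hypothetical stub: resolves an IP to a region. A real implementation would
// query an IP geolocation database (e.g. qqwry or ip2region) instead.
class IPParser {
    private static final IPParser INSTANCE = new IPParser();

    public static IPParser getInstance() {
        return INSTANCE;
    }

    public RegionInfo analyseIp(String ip) {
        return new RegionInfo(); // province stays null when the IP is unknown
    }

    public static class RegionInfo {
        private String province;

        public String getProvince() {
            return province;
        }
    }
}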
ProvinceReducer
import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class ProvinceReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    private final IntWritable result = new IntWritable();

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        // Sum the counts emitted for this province (or for the "-"/"+" markers)
        int sum = 0;
        for (IntWritable val : values) {
            sum += val.get();
        }
        result.set(sum);
        context.write(key, result);
    }
}
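For example, if the mapper emitted ("广东", 1) three times and ("-", 1) once, the reducer (and the combiner before it) would produce output lines like:

广东	3
-	1

Key and value are written tab-separated, which is the default TextOutputFormat behavior.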
ProvinceDriver
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class ProvinceDriver {

    public static void main(String[] args) throws Exception {
        if (args.length != 2) {
            System.err.println("Usage: ProvinceDriver <input path> <output path>");
            System.exit(-1);
        }

        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "Province View Count");
        job.setJarByClass(ProvinceDriver.class);

        job.setMapperClass(ProvinceMapper.class);
        // Summing is associative and commutative, so the reducer can also
        // serve as a combiner to cut down shuffle traffic
        job.setCombinerClass(ProvinceReducer.class);
        job.setReducerClass(ProvinceReducer.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        // The output path must not already exist, or the job fails at submit time
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
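To run the job, package the three classes (plus the utility classes) into a jar and submit it with the hadoop command. The jar name, package name, and HDFS paths below are placeholders:

hadoop jar hadoop-project.jar com.bigdata.hadoop.project.ProvinceDriver /access/input /access/output
hdfs dfs -cat /access/output/part-r-00000

Each line of part-r-00000 is a province name (or the "-"/"+" marker) followed by its total page-view count.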